Prajwal Kailas
dependency to run
45c1511
raw
history blame
24.5 kB
from collections import deque
from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple
class SentenceDataset(object):
"""
When we mention previous sentence and next sentence, we don't mean exactly one sentence
but rather a previous chunk and a next chunk. This can include one or more sentences and
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
This class is used to build a dataset at the sentence
level. It takes as input all the tokenized sentences in the note. So the input is
a list of lists where the outer list represents the sentences in the note and the inner list
is a list of tokens in the sentence. It then returns a dataset where each sentence is
concatenated with the previous and a next chunk. This is done so that when we build a model
we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc
will be computed and updated based on the current sentence. The previous and next chunks will
only be used to add context. We could have different sizes of previous and next chunks
depending on the position of the sentence etc. Essentially we build a sentence level dataset
where we can also provide context to the sentence by including the previous and next chunks
"""
def __init__(
self,
max_tokens: int,
max_prev_sentence_token: int,
max_next_sentence_token: int,
default_chunk_size: int,
ignore_label: str
) -> NoReturn:
"""
Set the maximum token length a given training example (sentence level) can have.
That is the total length of the current sentence + previous chunk + next chunk
We also set the the maximum length of the previous and next chunks. That is how many
tokens can be in these chunks. However if the total length exceeds, tokens in the
previous and next chunks will be dropped to ensure that the total length is < max_tokens
The default chunk size ensures that the length of the chunks will be a minimum number of
tokens based on the value passed. For example is default_chunk_size=10, the length
of the previous chunks and next chunks will be at least 10 tokens.
Args:
max_tokens (int): maximum token length a given training example (sentence level) can have
max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence
(training/prediction example) in the note can have
max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence
(training/prediction example) in the note can have
default_chunk_size (int): the training example will always include a chunk of this length
as part of the previous and next chunks
ignore_label (str): The label assigned to the previous and next chunks to distinguish
from the current sentence
"""
self._id_num = None
self._max_tokens = max_tokens
self._max_prev_sentence_token = max_prev_sentence_token
self._max_next_sentence_token = max_next_sentence_token
self._default_chunk_size = default_chunk_size
self._ignore_label = ignore_label
@staticmethod
def chunker(
seq: Sequence[Mapping[str, Union[str, int]]],
size: int
) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
"""
Return chunks of the sequence. The size of each chunk will be based
on the value passed to the size argument.
Args:
seq (Sequence): maximum token length a given training example (sentence level) can have
size (int): The max chunk size for the chunks
Return:
(Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of
the input sequence chunked version of the sequence
"""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))
def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
"""
Go through all the sentences in the medical note and create a list of
previous sentences. The output of this function will be a list of chunks
where each index of the list contains the sentences (chunks) - (tokens) present before
the sentence at that index in the medical note. For example prev_sent[0] will
be empty since there is no sentence before the first sentence in the note
prev_sent[1] will be equal to sent[0], that is the previous sentence of the
second sentence will be the first sentence. We make use of deque, where we
start to deque elements when it start to exceed max_prev_sentence_token. This
list of previous sentences will be used to define the previous chunks
Args:
sent_tokens (Sequence[str]): Sentences in the note and
each element of the list contains a
list of tokens in that sentence
Returns:
previous_sentences (List[deque]): A list of deque objects where each index contains a
list (queue) of previous tokens (chunk) with respect
to the sentence represented by that index in the note
"""
previous_sentences = list()
# Create a queue and specify the capacity of the queue
# Tokens will be popped from the queue when the capacity is exceeded
prev_sentence = deque(maxlen=self._max_prev_sentence_token)
# The first previous chunk is empty since the first sentence in the note does not have
# anything before it
previous_sentences.append(prev_sentence.copy())
# As we iterate through the list of sentences in the not, we add the tokens from the previous chunks
# to the the queue. Since we have a queue, as soon as the capacity is exceeded we pop tokens from
# the queue
for sent_token in sent_tokens[:-1]:
for token in sent_token:
prev_sentence.append(token)
# As soon as each sentence in the list is processed
# We add a copy of the current queue to a list - this list keeps track of the
# previous chunks for a sentence
previous_sentences.append(prev_sentence.copy())
return previous_sentences
def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
"""
Go through all the sentences in the medical note and create a list of
next sentences. The output of this function will be a list of lists
where each index of the list contains the list of sentences present after
the sentence at that index in the medical note. For example next_sent[-] will
be empty since there is no sentence after the last sentence in the note
next_sent[0] will be equal to sent[1:], that is the next sentence of the
first sentence will be the subsequent sentences. We make use of deque, where we
start to deque elements when it start to exceed max_next_sentence_token. This
list of previous sentences will be used to define the previous chunks
Args:
sent_tokens (Sequence[str]): Sentences in the note and each
element of the list contains a
list of tokens in that sentence
Returns:
next_sentences (List[deque]): A list of deque objects where each index contains a list (queue)
of next tokens (chunk) with respect to the sentence represented
by that index in the note
"""
# A list of next sentences is first created and reversed
next_sentences = list()
# Create a queue and specify the capacity of the queue
# Tokens will be popped from the queue when the capacity is exceeded
next_sentence = deque(maxlen=self._max_next_sentence_token)
# The first (which becomes the last chunk when we reverse this list) next chunk is empty since
# the last sentence in the note does not have
# anything after it
next_sentences.append(next_sentence.copy())
for sent_token in reversed(sent_tokens[1:]):
for token in reversed(sent_token):
next_sentence.appendleft(token)
next_sentences.append(next_sentence.copy())
# The list is reversed - since we went through the sentences in the reverse order in
# the earlier steps
return [next_sent for next_sent in reversed(next_sentences)]
def get_sentences(
self,
sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
token_text_key: str = 'text',
label_key: str = 'label',
start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
sub: bool = False
) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
"""
When we mention previous sentence and next sentence, we don't mean exactly one sentence
but rather a previous chunk and a next chunk. This can include one or more sentences and
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk
We iterate through all the tokenized sentences in the note. So the input is
a list of lists where the outer list represents the sentences in the note and the inner list
is a list of tokens in the sentence. It then returns a dataset where each sentence is
concatenated with the previous and the next sentence. This is done so that when we build a model
we can use the previous and next sentence to add context to the model. The weights and loss etc
will be computed and updated based on the current sentence. The previous and next sentence will
only be used to add context. We could have different sizes of previous and next chunks
depending on the position of the sentence etc. Since we split a note in several sentences which are
then used as training data.
ignore_label is used to differentiate between the current sentence and the previous and next
chunks. The chunks will have the label NA so that and the current sentence
will have the label (DATE, AGE etc) so that they can be distinguished.
If however we are building a dataset for predictions
the current sentence will have the default label O, but the next and previous chunks will still
have the label NA. However if the total length exceeds, tokens in the
previous and next chunks will be dropped to ensure that the total length is < max_tokens
The default chunk size ensures that the length of the chunks will be a minimum number of
tokens based on the value passed. For example is default_chunk_size=10, the length
of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens
even after decreasing the sizes of the previous and next chunks, then we split this long
sentence into sub sentences and repeat the process described above.
Args:
sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence
contains the tokens (dict) in that sentence
the token dict object contains the
token text, start, end etc
token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text
key to extract the text of the token from the dictionary
label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key
key to extract the label of the token from the dictionary. (if it does not have a label
the default label will be assigned)
start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence of with some
pre-defined chunk
end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence of with some
pre-defined chunk
sub (bool): Whether the function is called to process sub-sentences (used when we are splitting
long sentences into smaller sub sentences to keep sentence length < max_tokens
Returns:
(Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the
returned sentences,
where each sentence
has the previous
chunks and next
chunks attached
to it.
"""
# Id num keeps track of the id of the sentence - that is the position the sentence occurs in
# the note. We keep the id of sub sentences the same as the sentence, so that the user
# knows that these sub sentences are chunked from a longer sentence.
# <SENT 0> <SENT 1>. Say length of sent 0 with the previous and next chunks is less than max_tokens
# we return sent 0 with id 0. For sent 1, say the length is longer, we split it into sub
# sentences - <SUB 1><SUB 2> - we return SUB 1, and SUB 2 with id 1 - so we know that it belongs
# to <SENT 1> in the note.
if not sub:
self._id_num = -1
# Initialize the object that will take all the sentences in the note and return
# a dataset where each row represents a sentence in the note. The sentence in each
# row will also contain a previous chunk and next chunk (tokens) that will act as context
# when training the mode
# [ps1, ps 2, ps 3...ps-i], [cs1, cs2, ... cs-j], [ns, ns, ... ns-k] - as you can see the current sentence
# which is the sentence we train on (or predict on) will be in the middle - the surrounding tokens will
# provide context to the current sentence
# Get the previous sentences (chunks) for each sentence in the note
previous_sentences = self.get_previous_sentences(sent_tokens)
# Get the next sentences (chunks) for each sentence in the note
next_sentences = self.get_next_sentences(sent_tokens)
# For the note we are going to iterate through all the sentences in the note and
# concatenate each sentence with the previous and next chunks. (This forms the data that
# will be used for training/predictions) Each sentence with the concatenated chunks will be
# a training sample. We would do the same thing for getting predictions on a sentence as well
# The only difference would be the labels that are used. We would use the default label O for
# prediction and the annotated labels for prediction
if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
raise ValueError('Sentence length mismatch')
for index, (previous_sent, current_sent, next_sent) in enumerate(
zip(previous_sentences, sent_tokens, next_sentences)):
sent_tokens_text = list()
sent_labels = list()
sent_toks = list()
# Get the tokens and labels for the current sentence
for token in current_sent:
# We store this, if we need to process sub sentences when a sentence exceeds max_tokens
sent_toks.append(token)
sent_tokens_text.append(token[token_text_key])
sent_labels.append(token[label_key])
# We check if the number of tokens in teh current sentence + previous chunk
# + next chunk exceeds max tokens. If it does we start popping tokens from the previous and next chunks
# until the number of tokens is equal to max tokens
previous_sent_length = len(previous_sent)
current_sent_length = len(sent_tokens_text)
next_sent_length = len(next_sent)
total_length = previous_sent_length + current_sent_length + next_sent_length
# If the length of the current sentence plus the length of the previous and next
# chunks exceeds the max_tokens, start popping tokens from the previous and next
# chunks until either total length < max_tokens or the number of tokens in the previous and
# next chunks goes below the default chunk size
while total_length > self._max_tokens and \
(next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
if next_sent_length >= previous_sent_length:
next_sent.pop()
next_sent_length -= 1
total_length -= 1
elif previous_sent_length > next_sent_length:
previous_sent.popleft()
previous_sent_length -= 1
total_length -= 1
# If this is not a sub sentence, increment the ID to
# indicate the processing of the next sentence of the note
# If it is a sub sentence, keep the ID the same, to indicate
# it belongs to a larger sentence
if not sub:
self._id_num += 1
# If total length < max_tokens - process the sentence with the current sentence
# and add on the previous and next chunks and return
if total_length <= self._max_tokens:
# Check if we want to add a pre-defined chunk for the first sentence in the note
if index == 0 and start_chunk is not None:
previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \
[prev_token[token_text_key] for prev_token in list(previous_sent)]
else:
previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)]
# Check if we want to add a pre-defined chunk for the last sentence in the note
if index == len(sent_tokens) - 1 and end_chunk is not None:
next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \
[chunk[token_text_key] for chunk in end_chunk]
else:
next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)]
previous_sent_length = len(previous_sent_tokens)
next_sent_length = len(next_sent_tokens)
# Store information about the current sentence - start and end pos etc
# this can be used to distinguish from the next and previous chunks
# current_sent_info = {'token_info':current_sent}
# Assign an different label (the ignore label) to the chunks - since they are used only for context
previous_sent_labels = list()
next_sent_labels = list()
if self._ignore_label == 'NA':
previous_sent_labels = [self._ignore_label] * previous_sent_length
next_sent_labels = [self._ignore_label] * next_sent_length
elif self._ignore_label == 'label':
if index == 0 and start_chunk is not None:
previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \
[prev_token[label_key] for prev_token in list(previous_sent)]
else:
previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)]
if index == len(sent_tokens) - 1 and end_chunk is not None:
next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \
[chunk[label_key] for chunk in end_chunk]
else:
next_sent_labels = [next_token[label_key] for next_token in list(next_sent)]
# Concatenate the chunks and the sentence
# sent_tokens_text.append(token[token_text_key])
tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
labels_data = previous_sent_labels + sent_labels + next_sent_labels
# Return processed sentences
yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
# Process the sub sentences - we take a long sentence
# and split it into smaller chunks - and we recursively call the function on this list
# of smaller chunks - as mentioned before the smaller chunks (sub sentences) will have the
# same ID as the original sentence
else:
# Store the smaller chunks - say <SENT1> is too long
# <PREV CHUNK><SENT1><NEXT CHUNK>
# We get chunk sent 1 - to <SUB1><SUB2><SUB3> and we pass this [<SUB1><SUB2><SUB3>] to the function
# as a recursive call. This list is now processed as a smaller note that essentially belongs
# to a sentence. But as you can see we did not pass <PREV CHUNK> & <NEXT CHUNK>, because
# these are chunks that are not part of the current sentence, but they still need to be
# included in the final output - and the work around is mentioned below
# So that we have a previous chunk for <SUB1> and next chunk for <SUB3>
# we include the previous_sent_tokens and next_sent_tokens as the start chunk
# and the next chunk in the function call below
# <PREV CHUNK><SUB1><NEXT SUB1>, id = x
# <PREV SUB2><SUB2><NEXT SUB2>, id = x
# <PREV SUB3><SUB3><NEXT CHUNK>, id = x
sub_sentences = list()
# Prefix the first sentence in these smaller chunks
previous_sent_tokens = list(previous_sent)
# Suffix the last sentence in these smaller chunks
next_sent_tokens = list(next_sent)
# Get chunks
for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
sub_sentences.append(chunk)
# Process list of smaller chunks
for sub_sent in self.get_sentences(
sub_sentences,
token_text_key,
label_key,
start_chunk=previous_sent_tokens,
end_chunk=next_sent_tokens,
sub=True
):
yield sub_sent