|
from collections import deque
from typing import Deque, List, Sequence, Iterable, Optional, Dict, Mapping, Union, Tuple
|
|
|
|
|
class SentenceDataset(object): |
|
""" |
|
When we mention previous sentence and next sentence, we don't mean exactly one sentence |
|
but rather a previous chunk and a next chunk. This can include one or more sentences and |
|
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk |
|
This class is used to build a dataset at the sentence |
|
level. It takes as input all the tokenized sentences in the note. So the input is |
|
a list of lists where the outer list represents the sentences in the note and the inner list |
|
is a list of tokens in the sentence. It then returns a dataset where each sentence is |
|
concatenated with the previous and a next chunk. This is done so that when we build a model |
|
we can use the previous and next chunks to add context to the sentence/model. The weights and loss etc |
|
will be computed and updated based on the current sentence. The previous and next chunks will |
|
only be used to add context. We could have different sizes of previous and next chunks |
|
depending on the position of the sentence etc. Essentially we build a sentence level dataset |
|
where we can also provide context to the sentence by including the previous and next chunks |
|
""" |
|
|
|
    def __init__(
        self,
        max_tokens: int,
        max_prev_sentence_token: int,
        max_next_sentence_token: int,
        default_chunk_size: int,
        ignore_label: str
    ) -> None:
|
""" |
|
Set the maximum token length a given training example (sentence level) can have. |
|
That is the total length of the current sentence + previous chunk + next chunk |
|
We also set the the maximum length of the previous and next chunks. That is how many |
|
tokens can be in these chunks. However if the total length exceeds, tokens in the |
|
previous and next chunks will be dropped to ensure that the total length is < max_tokens |
|
The default chunk size ensures that the length of the chunks will be a minimum number of |
|
tokens based on the value passed. For example is default_chunk_size=10, the length |
|
of the previous chunks and next chunks will be at least 10 tokens. |
|
Args: |
|
max_tokens (int): maximum token length a given training example (sentence level) can have |
|
max_prev_sentence_token (int): The max chunk size for the previous chunks for a given sentence |
|
(training/prediction example) in the note can have |
|
max_next_sentence_token (int): The max chunk size for the next chunks for a given sentence |
|
(training/prediction example) in the note can have |
|
default_chunk_size (int): the training example will always include a chunk of this length |
|
as part of the previous and next chunks |
|
ignore_label (str): The label assigned to the previous and next chunks to distinguish |
|
from the current sentence |
|
""" |
|
        # Tracks the id assigned to each sentence-level example; reset in get_sentences
        self._id_num = None
|
self._max_tokens = max_tokens |
|
self._max_prev_sentence_token = max_prev_sentence_token |
|
self._max_next_sentence_token = max_next_sentence_token |
|
self._default_chunk_size = default_chunk_size |
|
self._ignore_label = ignore_label |
|
|
|
    @staticmethod
    def chunker(
        seq: Sequence[Mapping[str, Union[str, int]]],
        size: int
    ) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
|
""" |
|
Return chunks of the sequence. The size of each chunk will be based |
|
on the value passed to the size argument. |
|
Args: |
|
seq (Sequence): maximum token length a given training example (sentence level) can have |
|
size (int): The max chunk size for the chunks |
|
Return: |
|
(Iterable[Sequence[Mapping[str, Union[str, int]]]]): Iterable that iterates through fixed size chunks of |
|
the input sequence chunked version of the sequence |
|
|
|
""" |
|
return (seq[pos:pos + size] for pos in range(0, len(seq), size)) |
|
|
|
def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]: |
|
""" |
|
Go through all the sentences in the medical note and create a list of |
|
previous sentences. The output of this function will be a list of chunks |
|
where each index of the list contains the sentences (chunks) - (tokens) present before |
|
the sentence at that index in the medical note. For example prev_sent[0] will |
|
be empty since there is no sentence before the first sentence in the note |
|
prev_sent[1] will be equal to sent[0], that is the previous sentence of the |
|
second sentence will be the first sentence. We make use of deque, where we |
|
start to deque elements when it start to exceed max_prev_sentence_token. This |
|
list of previous sentences will be used to define the previous chunks |
|
Args: |
|
sent_tokens (Sequence[str]): Sentences in the note and |
|
each element of the list contains a |
|
list of tokens in that sentence |
|
Returns: |
|
previous_sentences (List[deque]): A list of deque objects where each index contains a |
|
list (queue) of previous tokens (chunk) with respect |
|
to the sentence represented by that index in the note |
|
""" |
|
        previous_sentences = list()
        # The deque has maxlen=max_prev_sentence_token, so the oldest (leftmost) tokens are
        # dropped automatically once the previous chunk exceeds that limit
        prev_sentence = deque(maxlen=self._max_prev_sentence_token)
        # The first sentence in the note has no previous chunk
        previous_sentences.append(prev_sentence.copy())
        # For every other sentence, the previous chunk consists of the tokens of all the
        # sentences seen so far (bounded by maxlen), so we snapshot the deque after consuming
        # each sentence
        for sent_token in sent_tokens[:-1]:
            for token in sent_token:
                prev_sentence.append(token)
            previous_sentences.append(prev_sentence.copy())

        return previous_sentences
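
    # Illustration (hypothetical tokens): for sent_tokens = [[a, b], [c], [d]], the previous chunks
    # are [deque([]), deque([a, b]), deque([a, b, c])] - subject to the max_prev_sentence_token limit.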
|
|
|
def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]: |
|
""" |
|
Go through all the sentences in the medical note and create a list of |
|
next sentences. The output of this function will be a list of lists |
|
where each index of the list contains the list of sentences present after |
|
the sentence at that index in the medical note. For example next_sent[-] will |
|
be empty since there is no sentence after the last sentence in the note |
|
next_sent[0] will be equal to sent[1:], that is the next sentence of the |
|
first sentence will be the subsequent sentences. We make use of deque, where we |
|
start to deque elements when it start to exceed max_next_sentence_token. This |
|
list of previous sentences will be used to define the previous chunks |
|
Args: |
|
sent_tokens (Sequence[str]): Sentences in the note and each |
|
element of the list contains a |
|
list of tokens in that sentence |
|
Returns: |
|
next_sentences (List[deque]): A list of deque objects where each index contains a list (queue) |
|
of next tokens (chunk) with respect to the sentence represented |
|
by that index in the note |
|
""" |
|
|
|
        next_sentences = list()
        # The deque has maxlen=max_next_sentence_token; using appendleft means tokens beyond
        # the limit are dropped from the far (right) end
        next_sentence = deque(maxlen=self._max_next_sentence_token)
        # The last sentence in the note has no next chunk
        next_sentences.append(next_sentence.copy())
        # Walk the note backwards so that, for each sentence, the deque holds the tokens of the
        # sentences that follow it
        for sent_token in reversed(sent_tokens[1:]):
            for token in reversed(sent_token):
                next_sentence.appendleft(token)
            next_sentences.append(next_sentence.copy())

        # The chunks were collected from the end of the note, so reverse the list to align each
        # index with its sentence
        return [next_sent for next_sent in reversed(next_sentences)]
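
    # Illustration (hypothetical tokens): for sent_tokens = [[a, b], [c], [d]], the next chunks
    # are [deque([c, d]), deque([d]), deque([])] - subject to the max_next_sentence_token limit.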
|
|
|
    def get_sentences(
        self,
        sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
        token_text_key: str = 'text',
        label_key: str = 'label',
        start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
        end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
        sub: bool = False
    ) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
|
""" |
|
When we mention previous sentence and next sentence, we don't mean exactly one sentence |
|
but rather a previous chunk and a next chunk. This can include one or more sentences and |
|
it does not mean that the sentence has to be complete (it can be cutoff in between) - hence a chunk |
|
We iterate through all the tokenized sentences in the note. So the input is |
|
a list of lists where the outer list represents the sentences in the note and the inner list |
|
is a list of tokens in the sentence. It then returns a dataset where each sentence is |
|
concatenated with the previous and the next sentence. This is done so that when we build a model |
|
we can use the previous and next sentence to add context to the model. The weights and loss etc |
|
will be computed and updated based on the current sentence. The previous and next sentence will |
|
only be used to add context. We could have different sizes of previous and next chunks |
|
depending on the position of the sentence etc. Since we split a note in several sentences which are |
|
then used as training data. |
|
ignore_label is used to differentiate between the current sentence and the previous and next |
|
chunks. The chunks will have the label NA so that and the current sentence |
|
will have the label (DATE, AGE etc) so that they can be distinguished. |
|
If however we are building a dataset for predictions |
|
the current sentence will have the default label O, but the next and previous chunks will still |
|
have the label NA. However if the total length exceeds, tokens in the |
|
previous and next chunks will be dropped to ensure that the total length is < max_tokens |
|
The default chunk size ensures that the length of the chunks will be a minimum number of |
|
tokens based on the value passed. For example is default_chunk_size=10, the length |
|
of the previous chunks and next chunks will be at least 10 tokens. If the total length > max tokens |
|
even after decreasing the sizes of the previous and next chunks, then we split this long |
|
sentence into sub sentences and repeat the process described above. |
|
Args: |
|
sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): Sentences in the note and each sentence |
|
contains the tokens (dict) in that sentence |
|
the token dict object contains the |
|
token text, start, end etc |
|
token_text_key (str): Each sentence contains a list of tokens where each token is a dict. We use the text |
|
key to extract the text of the token from the dictionary |
|
label_key (str): Each sentence contains a list of tokens where each token is a dict. We use the label_key |
|
key to extract the label of the token from the dictionary. (if it does not have a label |
|
the default label will be assigned) |
|
start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Prefix the first sentence of with some |
|
pre-defined chunk |
|
end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): Suffix the last sentence of with some |
|
pre-defined chunk |
|
sub (bool): Whether the function is called to process sub-sentences (used when we are splitting |
|
long sentences into smaller sub sentences to keep sentence length < max_tokens |
|
Returns: |
|
(Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]): Iterate through the |
|
returned sentences, |
|
where each sentence |
|
has the previous |
|
chunks and next |
|
chunks attached |
|
to it. |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
        # The example id is reset only on the top-level call; recursive calls (sub=True) keep
        # the id of the long sentence that is being split into sub-sentences
        if not sub:
            self._id_num = -1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
        # Gather the previous and next chunks for every sentence in the note
        previous_sentences = self.get_previous_sentences(sent_tokens)
        next_sentences = self.get_next_sentences(sent_tokens)

        # Sanity check: there must be exactly one previous chunk and one next chunk per sentence
        if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
            raise ValueError('Sentence length mismatch')
|
for index, (previous_sent, current_sent, next_sent) in enumerate( |
|
zip(previous_sentences, sent_tokens, next_sentences)): |
|
            # Collect the text and labels of the tokens in the current sentence; the original
            # token dicts are kept in sent_toks so the sentence can be re-chunked if it turns
            # out to be too long
            sent_tokens_text = list()
            sent_labels = list()
            sent_toks = list()
            for token in current_sent:
                sent_toks.append(token)
                sent_tokens_text.append(token[token_text_key])
                sent_labels.append(token[label_key])
|
|
|
|
|
|
|
previous_sent_length = len(previous_sent) |
|
current_sent_length = len(sent_tokens_text) |
|
next_sent_length = len(next_sent) |
|
total_length = previous_sent_length + current_sent_length + next_sent_length |
|
|
|
|
|
|
|
|
|
            # If the combined length exceeds max_tokens, drop tokens from whichever context chunk
            # is currently longer - from the far end of the next chunk and from the front of the
            # previous chunk - but never shrink a chunk below default_chunk_size
            while total_length > self._max_tokens and \
                    (next_sent_length > self._default_chunk_size or previous_sent_length > self._default_chunk_size):
                if next_sent_length >= previous_sent_length:
                    next_sent.pop()
                    next_sent_length -= 1
                    total_length -= 1
                elif previous_sent_length > next_sent_length:
                    previous_sent.popleft()
                    previous_sent_length -= 1
                    total_length -= 1
|
|
|
|
|
|
|
|
|
            # Each sentence in the note gets its own example id; sub-sentences produced from a
            # long sentence (sub=True) re-use the id of the sentence they came from
            if not sub:
                self._id_num += 1
|
|
|
|
|
            # The example fits within max_tokens: assemble the token text of the previous chunk,
            # current sentence and next chunk
            if total_length <= self._max_tokens:
|
|
|
                # The first sentence is prefixed with start_chunk and the last sentence suffixed
                # with end_chunk (these are passed when processing sub-sentences of a long sentence)
                if index == 0 and start_chunk is not None:
|
previous_sent_tokens = [chunk[token_text_key] for chunk in start_chunk] + \ |
|
[prev_token[token_text_key] for prev_token in list(previous_sent)] |
|
else: |
|
previous_sent_tokens = [prev_token[token_text_key] for prev_token in list(previous_sent)] |
|
|
|
if index == len(sent_tokens) - 1 and end_chunk is not None: |
|
next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] + \ |
|
[chunk[token_text_key] for chunk in end_chunk] |
|
else: |
|
next_sent_tokens = [next_token[token_text_key] for next_token in list(next_sent)] |
|
previous_sent_length = len(previous_sent_tokens) |
|
next_sent_length = len(next_sent_tokens) |
|
|
|
|
|
|
|
|
|
previous_sent_labels = list() |
|
next_sent_labels = list() |
|
                # Labels for the context chunks: in 'NA' mode every context token gets the ignore
                # label so that only the current sentence contributes to the loss; in 'label' mode
                # the context tokens keep their original labels
                if self._ignore_label == 'NA':
|
previous_sent_labels = [self._ignore_label] * previous_sent_length |
|
next_sent_labels = [self._ignore_label] * next_sent_length |
|
elif self._ignore_label == 'label': |
|
if index == 0 and start_chunk is not None: |
|
previous_sent_labels = [chunk[label_key] for chunk in start_chunk] + \ |
|
[prev_token[label_key] for prev_token in list(previous_sent)] |
|
else: |
|
previous_sent_labels = [prev_token[label_key] for prev_token in list(previous_sent)] |
|
if index == len(sent_tokens) - 1 and end_chunk is not None: |
|
next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] + \ |
|
[chunk[label_key] for chunk in end_chunk] |
|
else: |
|
next_sent_labels = [next_token[label_key] for next_token in list(next_sent)] |
|
|
|
|
|
                # Final example: context tokens + current sentence tokens with aligned labels;
                # current_sent_info keeps the original token dicts of the current sentence
                tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
|
labels_data = previous_sent_labels + sent_labels + next_sent_labels |
|
|
|
yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent} |
|
|
|
|
|
|
|
|
|
            else:
                # Even after trimming the previous and next chunks down to default_chunk_size the
                # example is still longer than max_tokens, so split the current sentence into
                # sub-sentences that leave room for default-sized chunks on either side and
                # process them recursively
                sub_sentences = list()

                # The trimmed previous and next chunks of the long sentence are passed along as
                # the start_chunk and end_chunk of its sub-sentences
                previous_sent_tokens = list(previous_sent)
                next_sent_tokens = list(next_sent)

                for chunk in SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size)):
                    sub_sentences.append(chunk)

                for sub_sent in self.get_sentences(
                        sub_sentences,
                        token_text_key,
                        label_key,
                        start_chunk=previous_sent_tokens,
                        end_chunk=next_sent_tokens,
                        sub=True
                ):
                    yield sub_sent
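

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original pipeline): the parameter values and the toy
    # token dicts below are illustrative assumptions.
    dataset = SentenceDataset(
        max_tokens=128,
        max_prev_sentence_token=32,
        max_next_sentence_token=32,
        default_chunk_size=10,
        ignore_label='NA'
    )
    note_sentences = [
        [{'text': 'John', 'label': 'NAME'}, {'text': 'visited', 'label': 'O'}],
        [{'text': 'on', 'label': 'O'}, {'text': '01/02/2020', 'label': 'DATE'}],
    ]
    for sent_id, example in dataset.get_sentences(note_sentences):
        # 'tokens' holds previous chunk + current sentence + next chunk (text only),
        # 'labels' aligns with 'tokens' ('NA' for the context chunks),
        # 'current_sent_info' holds the original token dicts of the current sentence.
        print(sent_id, example['tokens'], example['labels'])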
|
|