File size: 6,725 Bytes
45c1511
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import random
import re
from typing import Iterable, Dict, Sequence, Union, Mapping, Optional, List

from .labels import NERTokenLabels, NERPredictTokenLabels, MismatchError

# Seed the module-level RNG so any randomized behaviour downstream is
# reproducible across runs.
# NOTE(review): this seeds the process-global `random` module at import time,
# which affects every other user of `random` in the process — confirm this is
# intentional rather than scoping a `random.Random(41)` instance locally.
random.seed(41)


class Dataset(object):
    """
    Build a NER token classification dataset where each token carries a label
    derived from annotated spans.

    For training, the dataset is built from annotated spans (e.g. from prodigy).
    For prediction, every token is assigned a default label so that the same
    dataset format (and hence the same script) serves both modes.

    The dataset is produced at sentence level: each note is split into
    sentences and the task is run per sentence. Even predictions are run on a
    sentence level. The dataset looks like:
        Tokens: [tok1, tok2, ... tok n]
        Labels: [lab1, lab2, ... lab n]
    In prediction mode the labels are: [default, default, ... default]
    """

    # Flags a token containing an over-long run (9 or more) of special
    # characters / underscores. Such runs can make BERT truncate extremely
    # long sentences after sub-word tokenization. Raw strings avoid the
    # invalid-escape warnings the original non-raw patterns produced, and
    # compiling once here avoids re-compiling inside the per-token loop.
    _LONG_SPECIAL_RUN = re.compile(r'(\W|_){9,}')
    # Collapses any run of 8+ special characters down to exactly 8 copies of
    # the (last) matched character.
    _COLLAPSE_SPECIAL_RUN = re.compile(r'(?P<specchar>(\W|_)){8,}')
    # Replacement emitting 8 copies of the captured special character.
    _COLLAPSE_REPLACEMENT = r'\g<specchar>' * 8

    def __init__(
            self,
            sentencizer,
            tokenizer
    ):
        """
        Store the sentencizer and tokenizer used to build the dataset.

        Args:
            sentencizer (Union[SpacySentencizer, MimicStanzaSentencizer, NoteSentencizer]):
                The sentencizer to use for splitting notes into sentences.
            tokenizer (Union[ClinicalSpacyTokenizer, SpacyTokenizer, CoreNLPTokenizer]):
                The tokenizer to use for splitting text into tokens.
        """
        self._sentencizer = sentencizer
        self._tokenizer = tokenizer

    def get_tokens(
            self,
            text: str,
            spans: Optional[List[Mapping[str, Union[str, int]]]] = None,
            notation: str = 'BIO',
            token_text_key: str = 'text',
            label_key: str = 'label'
    ) -> Iterable[Sequence[Dict[str, Union[str, int]]]]:
        """
        Yield, per sentence in the note, the list of labelled tokens it
        contains (inner list = tokens of one sentence, iteration = sentences).

        Args:
            text (str): The text present in the note.
            spans (Optional[List[Mapping[str, Union[str, int]]]]): The NER
                spans in the note. None when building a prediction dataset.
            notation (str): The label-scheme notation (e.g. BIO, BILOU).
            token_text_key (str): The key under which each token's text is stored.
            label_key (str): The key under which each token's label is stored.

        Returns:
            Iterable[Sequence[Dict[str, Union[str, int]]]]: Iterable over
            sentences, yielding each sentence's list of token dicts.

        Raises:
            ValueError: If a span is labelled 'OTHERISSUE' or a token does not
                align with the annotated spans.
        """
        # No spans => prediction mode: every token gets the default label 'O'.
        # Otherwise align tokens with the annotated spans per the notation.
        if spans is None:
            label_spans = NERPredictTokenLabels('O')
        else:
            label_spans = NERTokenLabels(spans=spans, notation=notation)
        # Iterate through the sentences in the note
        for sentence in self._sentencizer.get_sentences(text=text):
            # Sentence start offset - converts token positions from
            # sentence-relative to note-relative coordinates.
            offset = sentence['start']
            # Keeps track of the tokens in the sentence
            tokens = list()
            for token in self._tokenizer.get_tokens(text=sentence['text']):
                # Get the token position (start, end) in the note
                token['start'] += offset
                token['end'] += offset
                # Skip whitespace-only and zero-width tokens. str.strip()
                # removes all whitespace, so a falsiness check subsumes the
                # original membership test against ['\n', '\t', ' ', ''].
                if not token[token_text_key].strip() or token['start'] == token['end']:
                    continue
                # Shorten consecutive sequences of special characters; this can
                # prevent BERT from truncating extremely long sentences that
                # could arise because of these characters.
                elif self._LONG_SPECIAL_RUN.search(token[token_text_key]):
                    print('WARNING - Shortening a long sequence of special characters from {} to 8'.format(
                        len(token[token_text_key])))
                    token[token_text_key] = self._COLLAPSE_SPECIAL_RUN.sub(
                        self._COLLAPSE_REPLACEMENT, token[token_text_key])
                elif len(token[token_text_key].split(' ')) != 1:
                    print('WARNING - Token contains a space character - will be replaced with hyphen')
                    token[token_text_key] = token[token_text_key].replace(' ', '-')
                # Get the label for each token based on the notation (BIO).
                # In predict mode the default label (e.g. O) is assigned.
                try:
                    label = label_spans.get_labels(token=token)
                    # 'OTHERISSUE' marks spans the annotator flagged as broken;
                    # refuse to build a dataset containing them.
                    if label[2:] == 'OTHERISSUE':
                        raise ValueError('Fix OTHERISSUE spans')
                # Token and span do not align.
                except MismatchError:
                    print(token)
                    raise ValueError('Token-Span mismatch')
                token[label_key] = label
                tokens.append(token)
            # Only yield non-empty sentences.
            if tokens:
                yield tokens