|
import json |
|
import random |
|
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter |
|
from typing import Iterable, Dict, List, Union, Optional, Sequence
|
|
|
from .dataset_builder import Dataset, SentenceDataset |
|
from .preprocessing import PreprocessingLoader |
|
|
|
random.seed(41) |
|
|
|
|
|
class DatasetCreator(object): |
|
""" |
|
    Build a NER token classification dataset.

    For training, the dataset is built from annotated spans (e.g. from Prodigy).
    For prediction, the labels are filled with a default value; this is done so that
    the same script can be used both for building a dataset to train a model and for
    building a dataset to obtain predictions from a model.

    The dataset is built on a sentence level, i.e. each note is split into sentences
    and the de-id task - training and prediction alike - is run on individual sentences.

    The dataset looks something like:
    Tokens: [[tok1, tok2, ... tok-n], [tok ...], ..., [tok ...]]
    Labels: [[lab1, lab2, ... lab-n], [lab ...], ..., [lab ...]]
    where each inner list represents a sentence - the tokens in the sentence and the
    respective label for each token. The labels depend on the notation.

    Example:
    Note: Bruce Wayne Jr is a 60yo man. He lives in Gotham
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, .], [He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O], [O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O], [O, O, O, U-LOC]]

    We can also create sentences that use the previous/next chunks as context, in which
    case the dataset looks something like this (assuming the chunk size is limited to 3 tokens):
    Sentences: [Bruce Wayne Jr is a 60yo man., He lives in Gotham]
    Tokens: [[Bruce, Wayne, Jr, is, a, 60, yo, man, ., He, lives, in], [yo, man, ., He, lives, in, Gotham]]
    Labels (BIO notation): [[B-Name, I-Name, I-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, B-LOC]]
    Labels (BILOU notation): [[B-Name, I-Name, L-Name, O, O, O, O, O, O, NA, NA, NA], [NA, NA, NA, O, O, O, U-LOC]]
    NA indicates that the token is present only as context.
|
""" |
|
|
|
def __init__( |
|
self, |
|
sentencizer: str, |
|
tokenizer: str, |
|
abbreviations: Optional[Sequence[str]] = None, |
|
max_tokens: int = 128, |
|
max_prev_sentence_token: int = 32, |
|
max_next_sentence_token: int = 32, |
|
default_chunk_size: int = 32, |
|
ignore_label: str = 'NA' |
|
    ) -> None:
|
""" |
|
        Initialize the sentencizer and tokenizer.
        Args:
            sentencizer (str): Specify which sentencizer to use
            tokenizer (str): Specify which tokenizer to use
            abbreviations (Optional[Sequence[str]]): A list of abbreviations for which tokens will not be
                                                     split - works only with the custom clinical tokenizer
            max_tokens (int): The maximum number of tokens allowed in a sentence/training example;
                              sentences that exceed it are truncated
            max_prev_sentence_token (int): The maximum number of previous chunk tokens allowed in a
                                           sentence/training example
            max_next_sentence_token (int): The maximum number of next chunk tokens allowed in a
                                           sentence/training example
            default_chunk_size (int): The default number of tokens used for the previous and next chunks
            ignore_label (str): The label assigned to the previous and next chunk tokens to distinguish
                                them from the tokens of the current sentence
|
""" |
|
self._sentencizer = PreprocessingLoader.get_sentencizer(sentencizer=sentencizer) |
|
        self._tokenizer = PreprocessingLoader.get_tokenizer(tokenizer=tokenizer, abbreviations=abbreviations)

        self._dataset = Dataset(sentencizer=self._sentencizer, tokenizer=self._tokenizer)

self._sentence_dataset = SentenceDataset( |
|
max_tokens=max_tokens, |
|
max_prev_sentence_token=max_prev_sentence_token, |
|
max_next_sentence_token=max_next_sentence_token, |
|
default_chunk_size=default_chunk_size, |
|
ignore_label=ignore_label |
|
) |
|
|
|
def create( |
|
self, |
|
input_file: str, |
|
mode: str = 'predict', |
|
notation: str = 'BIO', |
|
token_text_key: str = 'text', |
|
metadata_key: str = 'meta', |
|
note_id_key: str = 'note_id', |
|
label_key: str = 'labels', |
|
span_text_key: str = 'spans' |
|
) -> Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]: |
|
""" |
|
        Get the sentences that will be part of the NER dataset.
        We check whether the note belongs to the desired dataset split. If it does,
        we fix any spans that can cause token-span alignment errors, extract all the
        sentences in the note along with the tokens in each sentence, and finally add
        context tokens to the sentence if required. This function returns an iterable
        that iterates through each of the processed sentences/training examples.
        Args:
            input_file (str): Input jsonl file. Make sure the spans are sorted in ascending order
                              (based on start position)
            mode (str): Whether the dataset is being built for train or predict
            notation (str): The NER labelling notation
            token_text_key (str): The key where the note text and token text are present in the json object
            metadata_key (str): The key where the note metadata is present in the json object
            note_id_key (str): The key where the note id is present in the json object
            label_key (str): The key where the token label will be stored in the json object
            span_text_key (str): The key where the note spans are present in the json object
        Returns:
            (Iterable[Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]): An iterable over the
            processed sentences/training examples
|
""" |
|
|
|
        # Read one note (json object) per line from the input jsonl file
        with open(input_file, 'r') as input_data:
            for line in input_data:
                note = json.loads(line)
                note_text = note[token_text_key]
                note_id = note[metadata_key][note_id_key]
                # For training we read the annotated spans; for prediction there are no
                # spans and default labels will be assigned downstream
                if mode == 'train':
                    note_spans = note[span_text_key]
                elif mode == 'predict':
                    note_spans = None
                else:
                    raise ValueError("Invalid mode - can only be train/predict")
                # Split the note into sentences and tokens and assign a label to each
                # token based on the spans (or the default label in predict mode)
                sent_tokens = list(self._dataset.get_tokens(
                    text=note_text,
                    spans=note_spans,
                    notation=notation
                ))
                # Add the previous/next chunk context to each sentence and yield one
                # training/prediction example per sentence
                for ner_sent_index, ner_sentence in self._sentence_dataset.get_sentences(
                        sent_tokens=sent_tokens,
                        token_text_key=token_text_key,
                        label_key=label_key
                ):
                    # Keep track of the character offsets of the current sentence and the
                    # note id so that predictions can be mapped back to the note
                    current_sent_info = ner_sentence['current_sent_info']
                    note_sent_info_store = {
                        'start': current_sent_info[0]['start'],
                        'end': current_sent_info[-1]['end'],
                        'note_id': note_id
                    }
                    ner_sentence['note_sent_info'] = note_sent_info_store
                    yield ner_sentence
|
|
|
|
|
def main(): |
|
cli_parser = ArgumentParser( |
|
description='configuration arguments provided at run time from the CLI', |
|
formatter_class=ArgumentDefaultsHelpFormatter |
|
) |
|
cli_parser.add_argument( |
|
'--input_file', |
|
type=str, |
|
required=True, |
|
        help='the jsonl file that contains the notes. spans need to be sorted in ascending order '
             '(based on start position)'
|
) |
|
cli_parser.add_argument( |
|
'--notation', |
|
type=str, |
|
default='BIO', |
|
help='the notation we will be using for the label scheme' |
|
) |
|
cli_parser.add_argument( |
|
'--max_tokens', |
|
type=int, |
|
default=128, |
|
        help='the maximum number of tokens that a given sentence (training/prediction example) in the note can have'
|
) |
|
cli_parser.add_argument( |
|
'--default_chunk_size', |
|
type=int, |
|
default=32, |
|
        help='the default chunk size for the previous and next chunks of a given sentence '
             '(training/prediction example) in the note'
|
) |
|
cli_parser.add_argument( |
|
'--max_prev_sentence_token', |
|
type=int, |
|
default=32, |
|
        help='the maximum number of previous-chunk tokens that a given sentence (training/prediction '
             'example) in the note can have'
|
) |
|
cli_parser.add_argument( |
|
'--max_next_sentence_token', |
|
type=int, |
|
default=32, |
|
        help='the maximum number of next-chunk tokens that a given sentence (training/prediction '
             'example) in the note can have'
|
) |
|
cli_parser.add_argument( |
|
'--mode', |
|
type=str, |
|
choices=['train', 'predict'], |
|
required=True, |
|
help='whether we are building the dataset for training or prediction' |
|
) |
|
cli_parser.add_argument( |
|
'--sentencizer', |
|
type=str, |
|
required=True, |
|
help='the sentencizer to use for splitting notes into sentences' |
|
) |
|
cli_parser.add_argument( |
|
'--tokenizer', |
|
type=str, |
|
required=True, |
|
help='the tokenizer to use for splitting text into tokens' |
|
) |
|
cli_parser.add_argument( |
|
'--abbreviations', |
|
type=str, |
|
default=None, |
|
        help='file that will be used by the clinical tokenizer to handle abbreviations'
|
) |
|
cli_parser.add_argument( |
|
'--ignore_label', |
|
type=str, |
|
default='NA', |
|
        help='the label assigned to the previous and next chunk (context) tokens'
|
) |
|
cli_parser.add_argument( |
|
'--token_text_key', |
|
type=str, |
|
default='text', |
|
help='the key where the note text is present in the json object' |
|
) |
|
cli_parser.add_argument( |
|
'--metadata_key', |
|
type=str, |
|
default='meta', |
|
help='the key where the note metadata is present in the json object' |
|
) |
|
cli_parser.add_argument( |
|
'--note_id_key', |
|
type=str, |
|
default='note_id', |
|
        help='the key where the note id is present in the note metadata'
|
) |
|
cli_parser.add_argument( |
|
'--label_key', |
|
type=str, |
|
default='label', |
|
help='the key where the note label for each token is present in the json object' |
|
) |
|
cli_parser.add_argument( |
|
'--span_text_key', |
|
type=str, |
|
default='spans', |
|
        help='the key where the annotated spans of the note are present in the json object'
|
) |
|
cli_parser.add_argument( |
|
'--format', |
|
type=str, |
|
        default='jsonl',
        choices=['jsonl', 'conll'],
        help='the format to store the dataset in'
|
) |
|
cli_parser.add_argument( |
|
'--output_file', |
|
        type=str,
        required=True,
        help='the file where the NER dataset will be stored'
|
) |
|
args = cli_parser.parse_args() |
|
dataset_creator = DatasetCreator( |
|
sentencizer=args.sentencizer, |
|
tokenizer=args.tokenizer, |
|
abbreviations=args.abbreviations, |
|
max_tokens=args.max_tokens, |
|
max_prev_sentence_token=args.max_prev_sentence_token, |
|
max_next_sentence_token=args.max_next_sentence_token, |
|
default_chunk_size=args.default_chunk_size, |
|
ignore_label=args.ignore_label) |
|
ner_notes = dataset_creator.create( |
|
input_file=args.input_file, |
|
mode=args.mode, |
|
notation=args.notation, |
|
token_text_key=args.token_text_key, |
|
metadata_key=args.metadata_key, |
|
note_id_key=args.note_id_key, |
|
label_key=args.label_key, |
|
span_text_key=args.span_text_key |
|
) |
|
|
|
if args.format == 'jsonl': |
|
|
|
with open(args.output_file, 'w') as file: |
|
for ner_sentence in ner_notes: |
|
file.write(json.dumps(ner_sentence) + '\n') |
|
elif args.format == 'conll': |
|
with open(args.output_file, 'w') as file: |
|
for ner_sentence in ner_notes: |
|
tokens = ner_sentence['tokens'] |
|
labels = ner_sentence['labels'] |
|
current_sent_info = ner_sentence['current_sent_info'] |
|
note_id = ner_sentence['note_sent_info']['note_id'] |
|
if len(tokens) != len(labels) or len(labels) != len(current_sent_info): |
|
raise ValueError('Length mismatch') |
|
for token, label, sent_info in zip(tokens, labels, current_sent_info): |
|
sent_info['note_id'] = note_id |
|
data = token + ' ' + label + ' ' + json.dumps(sent_info) + '\n' |
|
file.write(data) |
|
file.write('\n') |
|
|
|
|
|
if __name__ == '__main__': |
|
|
|
main() |
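
# Example invocation (file names here are placeholders, and the sentencizer/tokenizer
# values depend on what PreprocessingLoader supports):
#
#   python -m <package>.dataset_creator \
#       --input_file notes.jsonl \
#       --mode train \
#       --sentencizer spacy \
#       --tokenizer clinical \
#       --output_file ner_train.jsonl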
|
|