|
import json
import random
from argparse import ArgumentParser
from typing import Dict, Iterable, List, NoReturn, Optional, Sequence, Union
|
|
|
# Seed Python's global random number generator with a fixed value at import
# time. Nothing in this part of the file draws random numbers, so the seed only
# affects code that uses the `random` module after this module is imported.
random.seed(41)
|
|
|
|
|
class SpanValidation(object):
    """
    This class is used to build a mapping between the note id
    and the annotated spans in that note. This will be used during the
    evaluation of the models. This is required to perform span level
    evaluation.
    """

    @staticmethod
    def get_spans(
            input_file: str,
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            spans_key: str = 'spans'
    ) -> Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]:
        """
        Get a mapping between the note id
        and the annotated spans in that note. This will mainly be used during the
        evaluation of the models.

        The input is read lazily, one JSON object per line. Blank lines are
        skipped. The spans of every note are returned sorted by their
        (start, end) values, so every span must contain the keys 'start'
        and 'end'.

        Args:
            input_file (str): The input file (one JSON encoded note per line)
            metadata_key (str): The key where the note metadata is present
            note_id_key (str): The key where the note id is present
            spans_key (str): The key that contains the annotated spans for a note dictionary
        Returns:
            (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
                                                                     and contains the note id and annotated spans
                                                                     for that note
        Raises:
            KeyError: If a note is missing the metadata, note id or spans key,
                      or a span is missing 'start' or 'end'
            json.JSONDecodeError: If a non-blank line is not valid JSON
        """
        # The context manager guarantees the file handle is released once the
        # generator is exhausted, closed or garbage collected.
        with open(input_file, 'r') as notes_file:
            for line in notes_file:
                # A blank line (e.g. a trailing newline at the end of the file)
                # carries no note and would otherwise crash json.loads.
                if not line.strip():
                    continue
                note = json.loads(line)
                note_id = note[metadata_key][note_id_key]
                # Order the spans by position so consumers can rely on it
                note_spans = sorted(note[spans_key], key=lambda _span: (_span['start'], _span['end']))
                yield {'note_id': note_id, 'note_spans': note_spans}
|
|
|
|
|
def main(argv: Optional[Sequence[str]] = None) -> None:
    """
    Command line entry point. Read the notes from a jsonl file and write one
    json object per note, containing the note id and the sorted annotated
    spans of that note, to the output file.

    Args:
        argv (Optional[Sequence[str]]): The command line arguments to parse.
                                        When None (the default) argparse
                                        reads them from sys.argv
    """
    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    # The value is read back as args.spans_key below. The option used to be
    # registered only as --span_text_key, which stored it under
    # args.span_text_key and made the lookup fail with an AttributeError.
    # The old flag is kept as an alias so existing invocations keep working.
    cli_parser.add_argument(
        '--spans_key',
        '--span_text_key',
        dest='spans_key',
        type=str,
        default='spans',
        help='the key where the annotated spans for the notes are present in the json object'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the file where the note id and the corresponding spans for that note are to be saved'
    )
    args = cli_parser.parse_args(argv)

    # Stream the notes: one json object (note id + spans) per output line
    with open(args.output_file, 'w') as file:
        for span_info in SpanValidation.get_spans(
                input_file=args.input_file,
                metadata_key=args.metadata_key,
                note_id_key=args.note_id_key,
                spans_key=args.spans_key):
            file.write(json.dumps(span_info) + '\n')
|
|
|
|
|
# Run the command line interface only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|