Medical-Note-Deidentification

Sleeping

File size: 3,281 Bytes

45c1511

import json
import random
from argparse import ArgumentParser
from typing import Union, NoReturn, Iterable, Dict, List

random.seed(41)


class SpanValidation(object):
    """
    This class is used to build  a mapping between the note id
    and the annotated spans in that note. This will be used during the
    evaluation of the models. This is required to perform span level
    evaluation.
    """
    @staticmethod
    def get_spans(
            input_file: str,
            metadata_key: str = 'meta',
            note_id_key: str = 'note_id',
            spans_key: str = 'spans'
    ):
        """
        Get a mapping between the note id
        and the annotated spans in that note. This will mainly be used during the
        evaluation of the models.
        Args:
            input_file (str): The input file
            metadata_key (str): The key where the note metadata is present
            note_id_key (str): The key where the note id is present
            spans_key (str): The key that contains the annotated spans for a note dictionary
        Returns:
            (Iterable[Dict[str, Union[str, List[Dict[str, str]]]]]): An iterable that iterates through each note
                                                                     and contains the note id and annotated spans
                                                                     for that note
        """
        # Read the input files (data source)
        for line in open(input_file, 'r'):
            note = json.loads(line)
            note_id = note[metadata_key][note_id_key]
            # Store the note_id and the annotated spans
            note[spans_key].sort(key=lambda _span: (_span['start'], _span['end']))
            yield {'note_id': note_id, 'note_spans': note[spans_key]}


def main() -> NoReturn:
    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
    cli_parser.add_argument(
        '--input_file',
        type=str,
        required=True,
        help='the the jsonl file that contains the notes'
    )
    cli_parser.add_argument(
        '--metadata_key',
        type=str,
        default='meta',
        help='the key where the note metadata is present in the json object'
    )
    cli_parser.add_argument(
        '--note_id_key',
        type=str,
        default='note_id',
        help='the key where the note id is present in the json object'
    )
    cli_parser.add_argument(
        '--span_text_key',
        type=str,
        default='spans',
        help='the key where the annotated spans for the notes are present in the json object'
    )
    cli_parser.add_argument(
        '--output_file',
        type=str,
        required=True,
        help='the file where the note id and the corresponding spans for that note are to be saved'
    )
    args = cli_parser.parse_args()

    # Write the dataset to the output file
    with open(args.output_file, 'w') as file:
        for span_info in SpanValidation.get_spans(
                input_file=args.input_file,
                metadata_key=args.metadata_key,
                note_id_key=args.note_id_key,
                spans_key=args.spans_key):
            file.write(json.dumps(span_info) + '\n')


if __name__ == "__main__":
    main()