Medical-Note-Deidentification

Sleeping

Medical-Note-Deidentification / ner_datasets /span_validation.py

Prajwal Kailas

dependency to run

45c1511 over 3 years ago

3.28 kB

	import json
	import random
	from argparse import ArgumentParser
	from typing import Union, NoReturn, Iterable, Dict, List

	# Seed the global `random` generator with a fixed value at import time.
	# NOTE(review): nothing visible in this file draws random numbers - confirm
	# whether this seeding is still needed here.
	random.seed(41)


	class SpanValidation(object):
	    """
	    This class is used to build a mapping between the note id
	    and the annotated spans in that note. This will be used during the
	    evaluation of the models. This is required to perform span level
	    evaluation.
	    """

	    @staticmethod
	    def get_spans(
	            input_file: str,
	            metadata_key: str = 'meta',
	            note_id_key: str = 'note_id',
	            spans_key: str = 'spans'
	    ) -> Iterable[Dict[str, Union[str, List[Dict]]]]:
	        """
	        Get a mapping between the note id
	        and the annotated spans in that note. This will mainly be used during the
	        evaluation of the models.

	        The input is read lazily, one line (one JSON object) at a time. Blank or
	        whitespace-only lines are skipped. The spans of each note are yielded
	        sorted by (start, end).

	        Args:
	            input_file (str): The input file (one JSON object per line)
	            metadata_key (str): The key where the note metadata is present
	            note_id_key (str): The key where the note id is present
	            spans_key (str): The key that contains the annotated spans for a note dictionary
	        Returns:
	            (Iterable[Dict[str, Union[str, List[Dict]]]]): An iterable that iterates through each note
	                                                           and contains the note id and annotated spans
	                                                           for that note
	        Raises:
	            KeyError: If a note is missing the metadata, note id or spans key, or
	                      a span is missing 'start' / 'end'
	            json.JSONDecodeError: If a non-blank line is not valid JSON
	        """
	        # The context manager closes the file once the generator is exhausted,
	        # explicitly closed, or garbage collected - the handle is never leaked.
	        with open(input_file, 'r') as notes_file:
	            for line in notes_file:
	                # Tolerate empty lines (e.g. a trailing newline at the end of the file)
	                if not line.strip():
	                    continue
	                note = json.loads(line)
	                note_id = note[metadata_key][note_id_key]
	                # Order the annotated spans by position in the note
	                note_spans = sorted(
	                    note[spans_key],
	                    key=lambda _span: (_span['start'], _span['end'])
	                )
	                # Emit the note_id together with its annotated spans
	                yield {'note_id': note_id, 'note_spans': note_spans}


	def main() -> None:
	    """
	    Command line entry point. Parse the CLI arguments, read the notes from
	    the input jsonl file and write one JSON object per line to the output
	    file, where each object holds a note id and the annotated spans of
	    that note.
	    """
	    cli_parser = ArgumentParser(description='configuration arguments provided at run time from the CLI')
	    cli_parser.add_argument(
	        '--input_file',
	        type=str,
	        required=True,
	        help='the the jsonl file that contains the notes'
	    )
	    cli_parser.add_argument(
	        '--metadata_key',
	        type=str,
	        default='meta',
	        help='the key where the note metadata is present in the json object'
	    )
	    cli_parser.add_argument(
	        '--note_id_key',
	        type=str,
	        default='note_id',
	        help='the key where the note id is present in the json object'
	    )
	    cli_parser.add_argument(
	        '--span_text_key',
	        type=str,
	        default='spans',
	        help='the key where the annotated spans for the notes are present in the json object'
	    )
	    cli_parser.add_argument(
	        '--output_file',
	        type=str,
	        required=True,
	        help='the file where the note id and the corresponding spans for that note are to be saved'
	    )
	    args = cli_parser.parse_args()

	    # Write the dataset to the output file
	    with open(args.output_file, 'w') as file:
	        for span_info in SpanValidation.get_spans(
	                input_file=args.input_file,
	                metadata_key=args.metadata_key,
	                note_id_key=args.note_id_key,
	                # argparse stores '--span_text_key' under the attribute
	                # 'span_text_key'; there is no 'spans_key' attribute on args
	                spans_key=args.span_text_key
	        ):
	            file.write(json.dumps(span_info) + '\n')


	# Run the CLI only when the module is executed as a script
	if __name__ == "__main__":
	    main()