Medical-Note-Deidentification

Sleeping

Medical-Note-Deidentification / ner_datasets /dataset_builder /sentence_dataset.py

Prajwal Kailas

dependency to run

45c1511 over 3 years ago

24.5 kB

	from collections import deque
	from typing import Deque, List, Sequence, Iterable, Optional, NoReturn, Dict, Mapping, Union, Tuple


	class SentenceDataset(object):
	    """
	    Build a sentence level dataset in which every sentence carries surrounding context.

	    The input is the list of tokenized sentences of a note: the outer list holds the
	    sentences and each inner list holds the token dicts of one sentence. Every sentence is
	    returned concatenated with a previous chunk and a next chunk. A "chunk" is not exactly
	    one sentence: it may span several sentences and may start or end in the middle of a
	    sentence. The chunks are only meant to provide context; they are labelled with the
	    ignore label so that they can be told apart from the current sentence. The chunk sizes
	    vary with the position of the sentence in the note and with the length of the sentence.
	    """

	    def __init__(
	            self,
	            max_tokens: int,
	            max_prev_sentence_token: int,
	            max_next_sentence_token: int,
	            default_chunk_size: int,
	            ignore_label: str
	    ) -> None:
	        """
	        Store the size limits used when examples are assembled.

	        An example is previous chunk + current sentence + next chunk. When it would be
	        longer than max_tokens, tokens are dropped from the previous and next chunks, but
	        a chunk is never cut below default_chunk_size tokens. A sentence that still does
	        not fit is split into sub sentences of max_tokens - 2 * default_chunk_size tokens.
	        Args:
	            max_tokens (int): maximum number of tokens an example (sentence + chunks) can have
	            max_prev_sentence_token (int): maximum number of tokens kept in a previous chunk
	            max_next_sentence_token (int): maximum number of tokens kept in a next chunk
	            default_chunk_size (int): chunks are never trimmed below this number of tokens
	            ignore_label (str): label given to the tokens of the previous and next chunks.
	                                The special value 'label' keeps the real labels of the
	                                chunk tokens instead of replacing them
	        """
	        # Id of the sentence currently processed by get_sentences (set when it runs)
	        self._id_num = None
	        self._max_tokens = max_tokens
	        self._max_prev_sentence_token = max_prev_sentence_token
	        self._max_next_sentence_token = max_next_sentence_token
	        self._default_chunk_size = default_chunk_size
	        self._ignore_label = ignore_label

	    @staticmethod
	    def chunker(
	            seq: Sequence[Mapping[str, Union[str, int]]],
	            size: int
	    ) -> Iterable[Sequence[Mapping[str, Union[str, int]]]]:
	        """
	        Split a sequence into consecutive chunks of at most size elements.

	        Args:
	            seq (Sequence): the sequence (e.g. the tokens of a sentence) to split
	            size (int): the number of elements in each chunk (the last one may be shorter)
	        Returns:
	            (Iterable[Sequence[Mapping[str, Union[str, int]]]]): iterable over the chunks
	        Raises:
	            ValueError: if size is not a positive number
	        """
	        # A negative step would silently produce no chunk at all (the tokens would be lost)
	        # and a step of zero fails with an unrelated range() message - reject both clearly
	        if size <= 0:
	            raise ValueError('Chunk size must be a positive integer, got {}'.format(size))
	        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

	    def get_previous_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
	        """
	        Collect, for every sentence of the note, the tokens that occur before it.

	        Index i of the result holds the (at most max_prev_sentence_token) tokens that
	        immediately precede sentence i. The first entry is therefore always empty.
	        Args:
	            sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): sentences in the
	                note, each one being the list of tokens of that sentence
	        Returns:
	            previous_sentences (List[deque]): one deque of previous tokens per sentence
	        """
	        # Bounded queue: once it is full, appending on the right drops the oldest
	        # (left most) tokens, so it always holds the tokens closest to the next sentence
	        window = deque(maxlen=self._max_prev_sentence_token)
	        # Nothing comes before the first sentence of the note
	        previous_sentences = [window.copy()]
	        for sentence in sent_tokens[:-1]:
	            window.extend(sentence)
	            # Snapshot the queue - it is the previous chunk of the following sentence
	            previous_sentences.append(window.copy())
	        return previous_sentences

	    def get_next_sentences(self, sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]]) -> List[Deque]:
	        """
	        Collect, for every sentence of the note, the tokens that occur after it.

	        Index i of the result holds the (at most max_next_sentence_token) tokens that
	        immediately follow sentence i. The last entry is therefore always empty.
	        Args:
	            sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): sentences in the
	                note, each one being the list of tokens of that sentence
	        Returns:
	            next_sentences (List[deque]): one deque of next tokens per sentence
	        """
	        # Bounded queue: once it is full, appending on the left drops the right most
	        # tokens, so it always holds the tokens closest to the preceding sentence
	        window = deque(maxlen=self._max_next_sentence_token)
	        # Built back to front: the first entry belongs to the last sentence, which has
	        # nothing after it
	        next_sentences = [window.copy()]
	        for sentence in reversed(sent_tokens[1:]):
	            # extendleft reverses its argument, so reversing first keeps the token order
	            window.extendleft(reversed(sentence))
	            next_sentences.append(window.copy())
	        # Put the entries back in note order
	        next_sentences.reverse()
	        return next_sentences

	    def _trim_context(self, previous_sent: Deque, next_sent: Deque, current_length: int, fixed_length: int) -> int:
	        """
	        Drop context tokens (in place) until the example fits in max_tokens.

	        Tokens are removed one at a time from the longer of the two chunks - the far end
	        of the next chunk or the far end of the previous chunk - and trimming stops once
	        the example fits or both chunks are down to default_chunk_size tokens.
	        Args:
	            previous_sent (Deque): previous chunk, trimmed from the left
	            next_sent (Deque): next chunk, trimmed from the right
	            current_length (int): number of tokens in the current sentence
	            fixed_length (int): number of tokens of the pre-defined start/end chunk that
	                                will be attached to the example and cannot be trimmed
	        Returns:
	            (int): length of previous chunk + current sentence + next chunk after trimming
	        """
	        previous_length = len(previous_sent)
	        next_length = len(next_sent)
	        overflow = previous_length + current_length + next_length + fixed_length - self._max_tokens
	        while overflow > 0 and \
	                (next_length > self._default_chunk_size or previous_length > self._default_chunk_size):
	            if next_length >= previous_length:
	                next_sent.pop()
	                next_length -= 1
	            else:
	                previous_sent.popleft()
	                previous_length -= 1
	            overflow -= 1
	        return previous_length + current_length + next_length

	    def get_sentences(
	            self,
	            sent_tokens: Sequence[Sequence[Mapping[str, Union[str, int]]]],
	            token_text_key: str = 'text',
	            label_key: str = 'label',
	            start_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
	            end_chunk: Optional[Sequence[Mapping[str, Union[str, int]]]] = None,
	            sub: bool = False
	    ) -> Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]:
	        """
	        Yield every sentence of the note together with its previous and next chunks.

	        For each sentence the previous chunk, the sentence and the next chunk are
	        concatenated. If the result is longer than max_tokens, tokens are dropped from
	        the chunks (never below default_chunk_size tokens per chunk). If the sentence
	        still does not fit, it is split into sub sentences of
	        max_tokens - 2 * default_chunk_size tokens which are processed the same way; the
	        sub sentences keep the id of the sentence they were cut from.
	        The tokens of the chunks get the ignore label, unless the ignore label is the
	        special value 'label', in which case they keep their own labels.
	        Args:
	            sent_tokens (Sequence[Sequence[Mapping[str, Union[str, int]]]]): sentences in the
	                note; each sentence is the list of its tokens (dicts)
	            token_text_key (str): key used to read the text of a token dict
	            label_key (str): key used to read the label of a token dict
	            start_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): tokens placed in
	                front of the previous chunk of the first sentence
	            end_chunk (Optional[Sequence[Mapping[str, Union[str, int]]]]): tokens placed
	                after the next chunk of the last sentence
	            sub (bool): True when the call processes the sub sentences of a long sentence
	                        (the sentence id is then left unchanged)
	        Returns:
	            (Iterable[Tuple[int, Dict[str, Union[List[Dict[str, Union[str, int]]], List[str]]]]]):
	                pairs of (sentence id, example) where the example has the keys 'tokens'
	                (texts of chunk + sentence + chunk), 'labels' (aligned with 'tokens') and
	                'current_sent_info' (the token dicts of the current sentence)
	        Raises:
	            ValueError: if a sentence has to be split but
	                        max_tokens - 2 * default_chunk_size is not positive
	        """
	        # The id is the position of the sentence in the note. Sub sentences are produced
	        # by a nested call (sub=True) which must not reset or advance the id.
	        if not sub:
	            self._id_num = -1
	        # A note without sentences produces no example
	        if len(sent_tokens) == 0:
	            return
	        previous_sentences = self.get_previous_sentences(sent_tokens)
	        next_sentences = self.get_next_sentences(sent_tokens)
	        if len(sent_tokens) != len(previous_sentences) or len(sent_tokens) != len(next_sentences):
	            raise ValueError('Sentence length mismatch')
	        last_index = len(sent_tokens) - 1
	        for index, (previous_sent, current_sent, next_sent) in enumerate(
	                zip(previous_sentences, sent_tokens, next_sentences)):
	            # Token dicts are kept as well - they are needed if the sentence must be split
	            sent_toks = list(current_sent)
	            sent_tokens_text = [token[token_text_key] for token in sent_toks]
	            sent_labels = [token[label_key] for token in sent_toks]
	            # The pre-defined chunks only apply to the first and the last sentence
	            prefix = start_chunk if index == 0 and start_chunk is not None else []
	            suffix = end_chunk if index == last_index and end_chunk is not None else []
	            # The pre-defined chunks are part of the example, so they count against
	            # max_tokens while trimming. Otherwise the first and last sub sentence of a
	            # split sentence end up longer than max_tokens.
	            total_length = self._trim_context(
	                previous_sent, next_sent, len(sent_toks), len(prefix) + len(suffix)
	            )
	            if not sub:
	                self._id_num += 1
	            if total_length <= self._max_tokens:
	                previous_sent_tokens = [token[token_text_key] for token in prefix] + \
	                                       [token[token_text_key] for token in previous_sent]
	                next_sent_tokens = [token[token_text_key] for token in next_sent] + \
	                                   [token[token_text_key] for token in suffix]
	                if self._ignore_label == 'label':
	                    # Keep the real labels of the context tokens
	                    previous_sent_labels = [token[label_key] for token in prefix] + \
	                                           [token[label_key] for token in previous_sent]
	                    next_sent_labels = [token[label_key] for token in next_sent] + \
	                                       [token[label_key] for token in suffix]
	                else:
	                    # Any other value is the label given to every context token, which
	                    # keeps 'labels' aligned with 'tokens'
	                    previous_sent_labels = [self._ignore_label] * len(previous_sent_tokens)
	                    next_sent_labels = [self._ignore_label] * len(next_sent_tokens)
	                tokens_data = previous_sent_tokens + sent_tokens_text + next_sent_tokens
	                labels_data = previous_sent_labels + sent_labels + next_sent_labels
	                yield self._id_num, {'tokens': tokens_data, 'labels': labels_data, 'current_sent_info': current_sent}
	            else:
	                # The sentence does not fit even with minimal chunks: cut it into sub
	                # sentences and process them like a small note. The (trimmed) previous and
	                # next chunks of the long sentence become the start and end chunk of that
	                # small note, so the first and last sub sentence keep their real context
	                #   <PREV CHUNK><SUB1><NEXT SUB1>, id = x
	                #   <PREV SUB2><SUB2><NEXT SUB2>, id = x
	                #   <PREV SUB3><SUB3><NEXT CHUNK>, id = x
	                # NOTE: a start_chunk/end_chunk passed to this call is not forwarded here
	                sub_sentences = list(
	                    SentenceDataset.chunker(sent_toks, self._max_tokens - (2 * self._default_chunk_size))
	                )
	                yield from self.get_sentences(
	                    sub_sentences,
	                    token_text_key,
	                    label_key,
	                    start_chunk=list(previous_sent),
	                    end_chunk=list(next_sent),
	                    sub=True
	                )