Spaces:
Sleeping
Sleeping
File size: 1,089 Bytes
5760b44 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
def create_baseline_pipeline() -> NerPipeline:
    """Build the baseline punctuation-restoration pipeline.

    Loads oliverguhr's multilingual full-stop model and wraps it in a
    token-classification ('ner') pipeline whose entities are the
    punctuation marks predicted after each token.
    """
    model_name = "oliverguhr/fullstop-punctuation-multilang-large"
    return pipeline(
        'ner',
        model=AutoModelForTokenClassification.from_pretrained(model_name),
        tokenizer=AutoTokenizer.from_pretrained(model_name),
    )
def _remove_punctuation(s: str) -> str:
to_remove = ".,?-:"
for char in to_remove:
s = s.replace(char, '')
return s
def _convert_pipeline_json_to_string(pipeline_json: list[dict]) -> str:
# TODO is it ok to remove redundant spaces, or should we keep input data as is and only touch commas?
# TODO don't accept tokens with commas inside words
return ''.join(
token['word'].replace('▁', ' ') + token['entity'].replace('0', '')
for token in pipeline_json
).strip()
def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
    """Re-punctuate *s* using the given token-classification pipeline.

    Existing punctuation is removed first, then the model's predictions
    are re-applied token by token.
    """
    stripped = _remove_punctuation(s)
    predictions = ner_pipeline(stripped)
    return _convert_pipeline_json_to_string(predictions)
|