File size: 1,089 Bytes
5760b44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline


def create_baseline_pipeline() -> NerPipeline:
    """Build the baseline punctuation-restoration pipeline.

    Downloads (or loads from cache) the multilingual fullstop
    punctuation model and wraps it in a token-classification ('ner')
    pipeline whose entities are the predicted punctuation marks.
    """
    checkpoint = "oliverguhr/fullstop-punctuation-multilang-large"
    return pipeline(
        'ner',
        model=AutoModelForTokenClassification.from_pretrained(checkpoint),
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
    )


def _remove_punctuation(s: str) -> str:
    to_remove = ".,?-:"
    for char in to_remove:
        s = s.replace(char, '')
    return s


def _convert_pipeline_json_to_string(pipeline_json: list[dict]) -> str:
    # TODO is it ok to remove redundant spaces, or should we keep input data as is and only touch commas?
    # TODO don't accept tokens with commas inside words
    return ''.join(
        token['word'].replace('▁', ' ') + token['entity'].replace('0', '')
        for token in pipeline_json
    ).strip()


def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
    """Re-punctuate *s* using the given punctuation-restoration pipeline.

    Existing punctuation (``.,?-:``) is stripped first so the model
    predicts punctuation from scratch; its token-level output is then
    folded back into a single string.
    """
    stripped = _remove_punctuation(s)
    predictions = ner_pipeline(stripped)
    return _convert_pipeline_json_to_string(predictions)