from flair.nn import Classifier
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokTokenizer
from segtok.segmenter import split_single
from collections import Counter
import pandas as pd
import os


# Load the OntoNotes NER model and the default English POS tagger
# (both models are downloaded on first use).
ner_tagger = SequenceTagger.load("flair/ner-english-ontonotes")
pos_tagger = Classifier.load("pos")


def get_named_entities(text: str, tagger=ner_tagger):
    # Split the text into sentences, tokenise them, and run the NER tagger.
    sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
    tagger.predict(sentences)

    entities = []

    # Collect every predicted NER span as its string representation.
    for sent in sentences:
        for entity in sent.get_spans("ner"):
            entities.append(str(entity))

    return entities
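
# Example usage (illustrative sketch; the sample text is made up):
# ents = get_named_entities("ALMA observed the galaxy NGC 253 from Chile in 2021.")
# print(ents)  # spans such as ORG, GPE, DATE predicted by the OntoNotes model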


def get_most_frequent_words(dataset: str, k=10):
    # Naive whitespace split: no lower-casing or punctuation stripping.
    words = dataset.split()
    counter = Counter(words)
    # Return the k most common (word, count) pairs.
    return counter.most_common(k)
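
# Example usage (illustrative):
# print(get_most_frequent_words("the cat sat on the mat the end", k=3))
# # -> [('the', 3), ('cat', 1), ('sat', 1)]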


# POS categories https://huggingface.co/flair/pos-english


def get_parts_of_sentence(text: str, tagger=pos_tagger):
    # Tag every token in the text with its part of speech and
    # return the annotated Sentence.
    sentence = Sentence(text)
    tagger.predict(sentence)
    return sentence
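
# Example usage (illustrative sketch; assumes the tagger stores labels under
# the "pos" label type):
# tagged = get_parts_of_sentence("The quick brown fox jumps over the lazy dog.")
# for token in tagged:
#     print(token.text, token.get_label("pos").value)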


# Example pipeline (commented out): read a CSV of catalogue metadata,
# join the text column into one string, tokenise it, and count frequent words.

# path_stem = os.path.join("datasets")
# file_name = "ch3_colour_data_viz_suggestions_set_2"
# ner_output_path = os.path.join(path_stem, f"{file_name}_ner.csv")

# df = pd.read_csv(ner_output_path)
# df = df.head(3)

# ner_dataset = df["alma_metadata"].to_list()
# ner_dataset = " ".join(ner_dataset)

# tokenizer = SegtokTokenizer()
# tokens = tokenizer.tokenize(ner_dataset)
# print(tokens)

# most_common = get_most_frequent_words(ner_dataset)
# print(most_common)