from flair.nn import Classifier
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.tokenization import SegtokTokenizer
from segtok.segmenter import split_single
from collections import Counter
import pandas as pd
import os
# Load the taggers once at module level so repeated calls reuse the same models.
ner_tagger = SequenceTagger.load("flair/ner-english-ontonotes")  # OntoNotes NER model
pos_tagger = Classifier.load("pos")  # English part-of-speech tagger
def get_named_entities(text: str, tagger=ner_tagger):
    """Split `text` into sentences, run NER over them, and return the tagged spans as strings."""
    sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
    tagger.predict(sentences)
    entities = []
    for sentence in sentences:
        for entity in sentence.get_spans("ner"):
            entities.append(str(entity))
    return entities
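# A minimal sketch, not part of the original script: the same NER pass, but
# aggregating entity tags into a Counter instead of stringifying each span.
# It assumes the span-level label type written by the OntoNotes model is "ner".
def count_entity_tags(text: str, tagger=ner_tagger):
    """Return a Counter mapping entity tags (e.g. PERSON, GPE) to their frequency in `text`."""
    sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(text)]
    tagger.predict(sentences)
    return Counter(
        span.get_label("ner").value
        for sentence in sentences
        for span in sentence.get_spans("ner")
    )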
def get_most_frequent_words(dataset: str, k=10):
    """Return the k most common whitespace-separated tokens in `dataset`."""
    split_str = dataset.split()
    counter = Counter(split_str)
    most_frequent = counter.most_common(k)
    return most_frequent
# POS categories: https://huggingface.co/flair/pos-english
def get_parts_of_sentence(text: str, tagger=pos_tagger):
    """Tag `text` with part-of-speech labels and return the annotated Sentence."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return sentence
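# A minimal sketch, not part of the original script, of how the tagged Sentence
# returned by get_parts_of_sentence might be unpacked into (token, POS tag) pairs;
# it assumes the token-level label type written by the "pos" model is "pos".
def get_pos_pairs(text: str, tagger=pos_tagger):
    """Return a list of (token text, POS tag) tuples for `text`."""
    sentence = get_parts_of_sentence(text, tagger=tagger)
    return [(token.text, token.get_label("pos").value) for token in sentence]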
# path_stem = os.path.join("datasets")
# file_name = "ch3_colour_data_viz_suggestions_set_2"
# ner_output_path = os.path.join(path_stem, f"{file_name}_ner.csv")
# df = pd.read_csv(ner_output_path)
# df = df.head(3)
# ner_dataset = df["alma_metadata"].to_list()
# ner_dataset = " ".join(ner_dataset)
# tokenizer = SegtokTokenizer()
# tokens = tokenizer.tokenize(ner_dataset)
# print(tokens)
# most_common = get_most_frequent_words(ner_dataset)
# print(most_common)
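# A minimal usage sketch (the sample text below is hypothetical, not drawn from
# the dataset referenced above); it exercises the three helpers defined in this file.
if __name__ == "__main__":
    sample = "The Hubble Space Telescope was launched by NASA in April 1990."
    print(get_named_entities(sample))
    print(get_most_frequent_words(sample, k=5))
    print(get_parts_of_sentence(sample))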