import json
import os

import spacy
from spacy.language import Language


class PipelineWrapper:
    """
    Pipeline wrapper for the project 'Frauenerwerbstätigkeit' with the
    Bertelsmann Foundation team.

    Loads pre-defined patterns from a JSON file and adds them to a span
    ruler in a spaCy pipeline.
    """

    def __init__(self, path: str) -> None:
        """
        Build the pipeline: load the German model, read the pattern file
        from ``<path>/data/`` and register the span ruler.

        Parameters
        ----------
        path:
            Project root directory containing the ``data`` folder.
        """
        self.patterns: list = []
        self.nlp: Language = spacy.load("de_core_news_sm")
        self.load_patterns(os.path.join(path, "data", "2024-08-16_patterns_1192.json"))
        self.add_span_ruler()

    def load_patterns(self, path: str) -> None:
        """
        Load patterns from a JSON file in spaCy pattern format.

        Parameters
        ----------
        path:
            Path to the pattern JSON file.
        """
        # Explicit utf-8: the pattern file contains German text
        # (umlauts) and must not depend on the platform default encoding.
        with open(file=path, mode="r", encoding="utf-8") as fp:
            self.patterns = json.load(fp=fp)

    def add_span_ruler(self) -> None:
        """
        Add a span ruler with the loaded patterns to the nlp pipeline.

        NOTE: the ruler deliberately uses the default configuration, so
        matched spans are stored under ``doc.spans["ruler"]`` — the key
        that ``bulk_predict`` reads. (A previous version built a config
        with ``spans_key=None`` but never applied it; applying it would
        redirect matches to ``doc.ents`` and break ``bulk_predict``.)
        """
        ruler = self.nlp.add_pipe("span_ruler")
        ruler.add_patterns(self.patterns)

    def __call__(self, queries: list) -> list:
        """Call method for the pipeline; delegates to ``bulk_predict``."""
        return self.bulk_predict(queries=queries)

    def bulk_predict(self, queries: list) -> list:
        """
        Bulk-predict the concepts found in each query text.

        Parameters
        ----------
        queries:
            List of dictionaries with this structure:
            ``{"posting_id": uuid, "text": str}``

        Returns
        -------
        list
            One dictionary per found concept, or a single entry with
            ``"concept": None`` when nothing matched:
            ``[{"posting_id": ..., "concept": ...}, ...]``
        """
        extractions = []
        # Only the span ruler is needed; disable every other component
        # of de_core_news_sm for speed.
        docs = self.nlp.pipe(
            (q["text"].lower() for q in queries),
            disable=["ner", "tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
        )
        for entry, doc in zip(queries, docs):
            labels = [span.label_ for span in doc.spans["ruler"]]
            if not labels:
                # No concept found: emit a single None entry for this posting.
                extractions.append({"posting_id": entry["posting_id"], "concept": None})
            else:
                # One output entry for each found concept.
                for label in labels:
                    extractions.append({"posting_id": entry["posting_id"], "concept": label})
        return extractions