impresso-project
/

ocr-quality-assessor-unigram-light

Token Classification

ocr_qa_assessment

quality-assessment

Model card · Files and versions · Community

emanuelaboros committed on Apr 15

Commit

b828aa0

·

1 Parent(s): 197e8c2

repaired import

Files changed (1) hide show

modeling_ocrqa.py +5 -9

modeling_ocrqa.py CHANGED Viewed

@@ -45,14 +45,6 @@ def normalize_text(s: str, unicode_normalize: Optional[str] = "NFKC") -> str:
         s = unicodedata.normalize(unicode_normalize, s).lower()
     return s.translate(NORMALIZATION_TABLE)
-MODEL_NAME = "impresso-project/impresso-langident"
-lang_pipeline = pipeline(
-    "langident",
-    model=MODEL_NAME,
-    trust_remote_code=True,
-    device="cpu",
-)
 def filter_text(text: str, bloom_filter: BloomFilter):
@@ -100,6 +92,10 @@ class QAAssessmentModel(PreTrainedModel):
                 # print(f"{bin_filename} not found locally, downloading from Hugging Face hub...")
                 self.ocrqa_assessors[lang] = hf_hub_download(repo_id=self.config.config._name_or_path,
                                                              filename=model_filename)
     def forward(self, input_ids, **kwargs):
         if isinstance(input_ids, str):
@@ -112,7 +108,7 @@ class QAAssessmentModel(PreTrainedModel):
         predictions, probabilities = [], []
         for text in texts:
-            langs = lang_pipeline(input_ids)
             # [{'label': 'fr', 'confidence': 99.87}]
             if len(langs) > 0:
                 lang = langs[0]['label']

         s = unicodedata.normalize(unicode_normalize, s).lower()
     return s.translate(NORMALIZATION_TABLE)
 def filter_text(text: str, bloom_filter: BloomFilter):
                 # print(f"{bin_filename} not found locally, downloading from Hugging Face hub...")
                 self.ocrqa_assessors[lang] = hf_hub_download(repo_id=self.config.config._name_or_path,
                                                              filename=model_filename)
+        self.lang_pipeline = pipeline("langident",
+                                    model="impresso-project/impresso-langident",
+                                    trust_remote_code=True,
+                                    device="cpu")
     def forward(self, input_ids, **kwargs):
         if isinstance(input_ids, str):
         predictions, probabilities = [], []
         for text in texts:
+            langs = self.lang_pipeline(input_ids)
             # [{'label': 'fr', 'confidence': 99.87}]
             if len(langs) > 0:
                 lang = langs[0]['label']