pierreguillou committed
Commit c77dc40
0 Parent(s):

Duplicate from pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,16 @@
+ ---
+ title: Document Understanding Inference APP (v2 - paragraph level - LayoutXLM base)
+ emoji: 🐢
+ colorFrom: blue
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.18.0
+ app_file: app.py
+ pinned: false
+ models:
+ - >-
+   pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512
+ duplicated_from: pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,202 @@
+ import os
+
+ # workaround: install an old version of PyTorch, since detectron2 hasn't released packages for PyTorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
+ # os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
+ os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
+
+ # install detectron2 from source so that it matches the installed PyTorch version
+ # see https://detectron2.readthedocs.io/tutorials/install.html for instructions
+ # os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
+ os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
+
+ import detectron2
+ from detectron2.utils.logger import setup_logger
+ setup_logger()
+
+ import gradio as gr
+ import re
+ import string
+
+ from operator import itemgetter
+ import collections
+
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+
+ ## files
+
+ import sys
+ sys.path.insert(0, 'files/')
+
+ import functions
+ from functions import *
+
+ # update pip
+ os.system('python -m pip install --upgrade pip')
+
+ ## model / feature extractor / tokenizer
+
+ import torch
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # model
+ from transformers import LayoutLMv2ForTokenClassification
+
+ model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
+
+ model = LayoutLMv2ForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+
+ # feature extractor
+ from transformers import LayoutLMv2FeatureExtractor
+ feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+ # tokenizer
+ from transformers import AutoTokenizer
+ tokenizer_id = "xlm-roberta-base"
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+ # get labels
+ id2label = model.config.id2label
+ label2id = model.config.label2id
+ num_labels = len(id2label)
+
+ # APP outputs
+ def app_outputs(uploaded_pdf):
+     filename, msg, images = pdf_to_images(uploaded_pdf)
+     num_images = len(images)
+
+     if not msg.startswith("Error with the PDF"):
+
+         # extraction of image data (texts and bounding boxes)
+         dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)
+         # prepare our data in the format expected by the model
+         encoded_dataset = dataset.map(prepare_inference_features_paragraph, batched=True, batch_size=64, remove_columns=dataset.column_names)
+         custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)
+         # get predictions (token level)
+         outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset)
+         # get predictions (paragraph level)
+         probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes)
+         # get labeled images with paragraph bounding boxes
+         images = get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)
+
+         img_files = list()
+         # save the labeled page images as files
+         for i in range(num_images):
+             if filename != "files/blank.png": img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
+             else: img_file = filename.replace(".pdf", ".png")
+             img_file = img_file.replace("/", "_")
+             images[i].save(img_file)
+             img_files.append(img_file)
+
+         if num_images < max_imgboxes:
+             img_files += [image_blank]*(max_imgboxes - num_images)
+             images += [Image.open(image_blank)]*(max_imgboxes - num_images)
+             for count in range(max_imgboxes - num_images):
+                 df[num_images + count] = pd.DataFrame()
+         else:
+             img_files = img_files[:max_imgboxes]
+             images = images[:max_imgboxes]
+             df = dict(itertools.islice(df.items(), max_imgboxes))
+
+         # save the CSV files
+         csv_files = list()
+         for i in range(max_imgboxes):
+             csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
+             csv_file = csv_file.replace("/", "_")
+             csv_files.append(gr.File.update(value=csv_file, visible=True))
+             df[i].to_csv(csv_file, encoding="utf-8", index=False)
+
+     else:
+         img_files, images, csv_files = [""]*max_imgboxes, [""]*max_imgboxes, [""]*max_imgboxes
+         img_files[0], img_files[1] = image_blank, image_blank
+         images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
+         csv_file = "csv_wo_content.csv"
+         csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
+         df, df_empty = dict(), pd.DataFrame()
+         df[0], df[1] = df_empty.to_csv(csv_file, encoding="utf-8", index=False), df_empty.to_csv(csv_file, encoding="utf-8", index=False)
+
+     return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
+
+ # Gradio APP
+ with gr.Blocks(title="Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)", css=".gradio-container") as demo:
+     gr.HTML("""
+     <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at paragraph level (v2 - LayoutXLM base)</h1></div>
+     <div style="margin-top: 40px"><p>(03/31/2023) This Inference APP uses the <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">model LayoutXLM base combined with XLM-RoBERTa base and finetuned on the dataset DocLayNet base at paragraph level</a> (chunk size of 512 tokens).</p></div>
+     <div><p><a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://arxiv.org/abs/2104.08836" target="_blank">LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding</a> is a Document Understanding model that uses both layout and text in order to detect labels of bounding boxes. Combined with the model <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/xlm-roberta-base" target="_blank">XLM-RoBERTa base</a>, this finetuned model has the capacity to <b>understand any language</b>. Finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a>, it can <b>classify any bounding box (and its OCR text) into 11 labels</b> (Caption, Footnote, Formula, List-item, Page-footer, Page-header, Picture, Section-header, Table, Text, Title).</p></div>
+     <div><p>It relies on an external OCR engine to get words and bounding boxes from the document image. Thus, this APP first runs an OCR engine (<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/madmaze/pytesseract#python-tesseract" target="_blank">PyTesseract</a>) to get the bounding boxes, then runs LayoutXLM base (already fine-tuned on the dataset DocLayNet base at paragraph level) on the individual tokens, and then visualizes the result at paragraph level!</p></div>
+     <div><p><b>It allows you to get all pages of any PDF (in any language) with bounding boxes labeled at paragraph level and the associated dataframes with labeled data (bounding boxes, texts, labels) :-)</b></p></div>
+     <div><p>However, the inference time per page can be high when running the model on CPU due to the number of paragraph predictions to be made. Therefore, to avoid running this APP for too long, <b>only the first 2 pages are processed by this APP</b>. If you want to increase this limit, you can either clone this APP in Hugging Face Space (or run its <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/Gradio_inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">notebook</a> on your own platform) and change the value of the parameter <code>max_imgboxes</code>, or run the inference notebook "<a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://github.com/piegu/language-models/blob/master/inference_on_LayoutXLM_base_model_finetuned_on_DocLayNet_base_in_any_language_at_levelparagraphs_ml512.ipynb" target="_blank">Document AI | Inference at paragraph level with a Document Understanding model (LayoutXLM base fine-tuned on DocLayNet dataset)</a>" on your own platform, as it does not have this limit.</p></div>
+     <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the model and this APP can be found in the following blog posts:</p>
+     <ul><li>(03/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base</a></li><li>(03/25/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15" target="_blank">Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level</a></li><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
+     """)
+     with gr.Row():
+         pdf_file = gr.File(label="PDF")
+     with gr.Row():
+         submit_btn = gr.Button(f"Display first {max_imgboxes} labeled PDF pages")
+         reset_btn = gr.Button(value="Clear")
+     with gr.Row():
+         output_msg = gr.Textbox(label="Output message")
+     with gr.Row():
+         fileboxes = []
+         for num_page in range(max_imgboxes):
+             file_path = gr.File(visible=True, label=f"Image file of the PDF page n°{num_page}")
+             fileboxes.append(file_path)
+     with gr.Row():
+         imgboxes = []
+         for num_page in range(max_imgboxes):
+             img = gr.Image(type="pil", label=f"Image of the PDF page n°{num_page}")
+             imgboxes.append(img)
+     with gr.Row():
+         csvboxes = []
+         for num_page in range(max_imgboxes):
+             csv = gr.File(visible=True, label=f"CSV file at paragraph level (page {num_page})")
+             csvboxes.append(csv)
+     with gr.Row():
+         dfboxes = []
+         for num_page in range(max_imgboxes):
+             df = gr.Dataframe(
+                 headers=["bounding boxes", "texts", "labels"],
+                 datatype=["str", "str", "str"],
+                 col_count=(3, "fixed"),
+                 visible=True,
+                 label=f"Data of page {num_page}",
+                 type="pandas",
+                 wrap=True
+             )
+             dfboxes.append(df)
+
+     outputboxes = [output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+     submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
+     # https://github.com/gradio-app/gradio/pull/2044/files#diff-a91dd2749f68bb7d0099a0f4079a4fd2d10281e299e7b451cb1bb876a7c21975R91
+     reset_btn.click(
+         lambda: [pdf_file.update(value=None), output_msg.update(value=None)] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
+         inputs=[],
+         outputs=[pdf_file, output_msg] + fileboxes + imgboxes + csvboxes + dfboxes
+     )
+
+     gr.Examples(
+         [["files/example.pdf"]],
+         [pdf_file],
+         outputboxes,
+         fn=app_outputs,
+         cache_examples=True,
+     )
+
+ demo.launch()
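
A note on the chunking done in `app.py` above: since the model accepts at most `max_length` (512) tokens per sequence, long pages are tokenized into overlapping chunks (`return_overflowing_tokens=True`, `stride=doc_stride`) in `prepare_inference_features_paragraph` (see `files/functions.py` below), and the overlapping token predictions are averaged afterwards. A minimal sketch of that chunking behavior, assuming only the `xlm-roberta-base` tokenizer used above (the sample text and the final print are illustrative, not part of the Space):

```python
# minimal sketch of the overlapping-chunk tokenization used by the Space
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

long_text = " ".join(["paragraph"] * 2000)  # hypothetical text longer than one chunk
encodings = tokenizer(
    long_text,
    truncation=True,
    padding="max_length",
    max_length=512,          # same max_length as in app.py / functions.py
    stride=128,              # same doc_stride as in files/functions.py
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

# each entry of encodings["input_ids"] is one 512-token chunk;
# consecutive chunks share 128 tokens of overlap
print(len(encodings["input_ids"]), "chunks of", len(encodings["input_ids"][0]), "tokens")
```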
files/README.md ADDED
File without changes
files/blank.pdf ADDED
Binary file (1.15 kB)
files/blank.png ADDED
files/example.pdf ADDED
Binary file (343 kB)
files/functions.py ADDED
@@ -0,0 +1,882 @@
+ import os
+
+ # workaround: install an old version of PyTorch, since detectron2 hasn't released packages for PyTorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
+ # os.system('pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html')
+ os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11+cu111 -f https://download.pytorch.org/whl/torch_stable.html')
+
+ # install detectron2 from source so that it matches the installed PyTorch version
+ # see https://detectron2.readthedocs.io/tutorials/install.html for instructions
+ # os.system('pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html')
+ os.system('pip install git+https://github.com/facebookresearch/detectron2.git')
+
+ import detectron2
+ from detectron2.utils.logger import setup_logger
+ setup_logger()
+
+ import gradio as gr
+ import re
+ import string
+ import torch
+
+ from operator import itemgetter
+ import collections
+
+ import pypdf
+ from pypdf import PdfReader
+ from pypdf.errors import PdfReadError
+
+ import pdf2image
+ from pdf2image import convert_from_path
+ import langdetect
+ from langdetect import detect_langs
+
+ import pandas as pd
+ import numpy as np
+ import random
+ import tempfile
+ import itertools
+
+ from matplotlib import font_manager
+ from PIL import Image, ImageDraw, ImageFont
+ import cv2
+
+ import pathlib
+ from pathlib import Path
+ import shutil
+
+ # Tesseract
+ print(os.popen('cat /etc/debian_version').read())
+ print(os.popen('cat /etc/issue').read())
+ print(os.popen('apt search tesseract').read())
+ import pytesseract
+
+ ## Key parameters
+
+ # categories colors
+ label2color = {
+     'Caption': 'brown',
+     'Footnote': 'orange',
+     'Formula': 'gray',
+     'List-item': 'yellow',
+     'Page-footer': 'red',
+     'Page-header': 'red',
+     'Picture': 'violet',
+     'Section-header': 'orange',
+     'Table': 'green',
+     'Text': 'blue',
+     'Title': 'pink'
+ }
+
+ # bounding boxes of the start and end tokens of a sequence
+ cls_box = [0, 0, 0, 0]
+ sep_box = [1000, 1000, 1000, 1000]
+
+ # model
+ model_id = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
+
+ # tokenizer
+ tokenizer_id = "xlm-roberta-base"
+
+ # (tokenization) the maximum length of a feature (sequence)
+ if str(384) in model_id:
+     max_length = 384
+ elif str(512) in model_id:
+     max_length = 512
+ else:
+     print("Error with max_length of chunks!")
+
+ # (tokenization) overlap
+ doc_stride = 128  # the authorized overlap between two parts of the context when splitting is needed
+
+ # max number of PDF page images that will be displayed
+ max_imgboxes = 2
+
+ # get files
+ examples_dir = 'files/'
+ Path(examples_dir).mkdir(parents=True, exist_ok=True)
+ from huggingface_hub import hf_hub_download
+ files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
+ for file_name in files:
+     path_to_file = hf_hub_download(
+         repo_id = "pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
+         filename = "files/" + file_name,
+         repo_type = "space"
+     )
+     shutil.copy(path_to_file, examples_dir)
+
+ # paths to files
+ image_wo_content = examples_dir + "wo_content.png"  # image without content
+ pdf_blank = examples_dir + "blank.pdf"  # blank PDF
+ image_blank = examples_dir + "blank.png"  # blank image
+
+ ## get langdetect2Tesseract dictionary
+ t = "files/languages_tesseract.csv"
+ l = "files/languages_iso.csv"
+
+ df_t = pd.read_csv(t)
+ df_l = pd.read_csv(l)
+
+ langs_t = df_t["Language"].to_list()
+ langs_t = [lang_t.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_t in langs_t]
+ langs_l = df_l["Language"].to_list()
+ langs_l = [lang_l.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_l in langs_l]
+ langscode_t = df_t["LangCode"].to_list()
+ langscode_l = df_l["LangCode"].to_list()
+
+ Tesseract2langdetect, langdetect2Tesseract = dict(), dict()
+ for lang_t, langcode_t in zip(langs_t, langscode_t):
+     try:
+         if lang_t == "Chinese - Simplified".lower().strip().translate(str.maketrans('', '', string.punctuation)): lang_t = "chinese"
+         index = langs_l.index(lang_t)
+         langcode_l = langscode_l[index]
+         Tesseract2langdetect[langcode_t] = langcode_l
+     except:
+         continue
+
+ langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
+
+ ## model / feature extractor / tokenizer
+
+ import torch
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ from transformers import LayoutLMv2ForTokenClassification  # LayoutXLMTokenizerFast,
+
+ model = LayoutLMv2ForTokenClassification.from_pretrained(model_id)
+ model.to(device)
+
+ # feature extractor
+ from transformers import LayoutLMv2FeatureExtractor
+ feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+
+ # tokenizer
+ from transformers import AutoTokenizer
+ tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+
+ # get labels
+ id2label = model.config.id2label
+ label2id = model.config.label2id
+ num_labels = len(id2label)
+
+ ## General
+
+ # get text and bounding boxes from an image
+ # https://stackoverflow.com/questions/61347755/how-can-i-get-line-coordinates-that-readed-by-tesseract
+ # https://medium.com/geekculture/tesseract-ocr-understanding-the-contents-of-documents-beyond-their-text-a98704b7c655
+ def get_data_paragraph(results, factor, conf_min=0):
+
+     # group the Tesseract words into block / paragraph / line levels
+     data = {}
+     for i in range(len(results['line_num'])):
+         level = results['level'][i]
+         block_num = results['block_num'][i]
+         par_num = results['par_num'][i]
+         line_num = results['line_num'][i]
+         top, left = results['top'][i], results['left'][i]
+         width, height = results['width'][i], results['height'][i]
+         conf = results['conf'][i]
+         text = results['text'][i]
+         if not (text == '' or text.isspace()):
+             if conf >= conf_min:
+                 tup = (text, left, top, width, height)
+                 if block_num in list(data.keys()):
+                     if par_num in list(data[block_num].keys()):
+                         if line_num in list(data[block_num][par_num].keys()):
+                             data[block_num][par_num][line_num].append(tup)
+                         else:
+                             data[block_num][par_num][line_num] = [tup]
+                     else:
+                         data[block_num][par_num] = {}
+                         data[block_num][par_num][line_num] = [tup]
+                 else:
+                     data[block_num] = {}
+                     data[block_num][par_num] = {}
+                     data[block_num][par_num][line_num] = [tup]
+
+     # get paragraphs dictionary with list of lines
+     par_data = {}
+     par_idx = 1
+     for _, b in data.items():
+         for _, p in b.items():
+             line_data = {}
+             line_idx = 1
+             for _, l in p.items():
+                 line_data[line_idx] = l
+                 line_idx += 1
+             par_data[par_idx] = line_data
+             par_idx += 1
+
+     # get lines of texts, grouped by paragraph
+     texts_pars = list()
+     row_indexes = list()
+     texts_lines = list()
+     texts_lines_par = list()
+     row_index = 0
+     for _, par in par_data.items():
+         count_lines = 0
+         lines_par = list()
+         for _, line in par.items():
+             if count_lines == 0: row_indexes.append(row_index)
+             line_text = ' '.join([item[0] for item in line])
+             texts_lines.append(line_text)
+             lines_par.append(line_text)
+             count_lines += 1
+             row_index += 1
+         # lines.append("\n")
+         row_index += 1
+         texts_lines_par.append(lines_par)
+         texts_pars.append(' '.join(lines_par))
+     # lines = lines[:-1]
+
+     # get paragraph boxes (par_boxes)
+     # get line boxes (line_boxes)
+     par_boxes = list()
+     par_idx = 1
+     line_boxes, lines_par_boxes = list(), list()
+     line_idx = 1
+     for _, par in par_data.items():
+         xmins, ymins, xmaxs, ymaxs = list(), list(), list(), list()
+         line_boxes_par = list()
+         count_line_par = 0
+         for _, line in par.items():
+             xmin, ymin = line[0][1], line[0][2]
+             xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4])
+             line_boxes.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+             line_boxes_par.append([int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)])
+             xmins.append(xmin)
+             ymins.append(ymin)
+             xmaxs.append(xmax)
+             ymaxs.append(ymax)
+             line_idx += 1
+             count_line_par += 1
+         xmin, ymin, xmax, ymax = min(xmins), min(ymins), max(xmaxs), max(ymaxs)
+         par_bbox = [int(xmin/factor), int(ymin/factor), int(xmax/factor), int(ymax/factor)]
+         par_boxes.append(par_bbox)
+         lines_par_boxes.append(line_boxes_par)
+         par_idx += 1
+
+     return texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes
+
+ # rescale image to get 300dpi
+ def set_image_dpi_resize(image):
+     """
+     Rescale an image to 300dpi while resizing
+     :param image: An image
+     :return: The rescale factor and the path to the rescaled image
+     """
+     length_x, width_y = image.size
+     factor = min(1, float(1024.0 / length_x))
+     size = int(factor * length_x), int(factor * width_y)
+     # image_resize = image.resize(size, Image.Resampling.LANCZOS)
+     image_resize = image.resize(size, Image.LANCZOS)
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='1.png')
+     temp_filename = temp_file.name
+     image_resize.save(temp_filename, dpi=(300, 300))
+     return factor, temp_filename
+
+ # it is important that each bounding box is in (upper left, lower right) format
+ # source: https://github.com/NielsRogge/Transformers-Tutorials/issues/129
+ def upperleft_to_lowerright(bbox):
+     x0, y0, x1, y1 = tuple(bbox)
+     if bbox[2] < bbox[0]:
+         x0 = bbox[2]
+         x1 = bbox[0]
+     if bbox[3] < bbox[1]:
+         y0 = bbox[3]
+         y1 = bbox[1]
+     return [x0, y0, x1, y1]
+
+ # convert bounding boxes from (left, top, width, height) format to (left, top, left+width, top+height) format
+ def convert_box(bbox):
+     x, y, w, h = tuple(bbox)  # the row comes in (left, top, width, height) format
+     return [x, y, x+w, y+h]  # we turn it into (left, top, left+width, top+height) to get the actual box
+
+ # the model expects bounding boxes normalized to a 1000x1000 coordinate space
+ def normalize_box(bbox, width, height):
+     return [
+         int(1000 * (bbox[0] / width)),
+         int(1000 * (bbox[1] / height)),
+         int(1000 * (bbox[2] / width)),
+         int(1000 * (bbox[3] / height)),
+     ]
+
+ # convert back from the normalized 1000x1000 coordinate space to pixels
+ def denormalize_box(bbox, width, height):
+     return [
+         int(width * (bbox[0] / 1000)),
+         int(height * (bbox[1] / 1000)),
+         int(width * (bbox[2] / 1000)),
+         int(height * (bbox[3] / 1000)),
+     ]
+
+ # get back original size
+ def original_box(box, original_width, original_height, coco_width, coco_height):
+     return [
+         int(original_width * (box[0] / coco_width)),
+         int(original_height * (box[1] / coco_height)),
+         int(original_width * (box[2] / coco_width)),
+         int(original_height * (box[3] / coco_height)),
+     ]
+
+ def get_blocks(bboxes_block, categories, texts):
+
+     # get list of unique block boxes
+     bbox_block_dict, bboxes_block_list, bbox_block_prec = dict(), list(), list()
+     for count_block, bbox_block in enumerate(bboxes_block):
+         if bbox_block != bbox_block_prec:
+             bbox_block_indexes = [i for i, bbox in enumerate(bboxes_block) if bbox == bbox_block]
+             bbox_block_dict[count_block] = bbox_block_indexes
+             bboxes_block_list.append(bbox_block)
+             bbox_block_prec = bbox_block
+
+     # get list of categories and texts by unique block boxes
+     category_block_list, text_block_list = list(), list()
+     for bbox_block in bboxes_block_list:
+         count_block = bboxes_block.index(bbox_block)
+         bbox_block_indexes = bbox_block_dict[count_block]
+         category_block = np.array(categories, dtype=object)[bbox_block_indexes].tolist()[0]
+         category_block_list.append(category_block)
+         text_block = np.array(texts, dtype=object)[bbox_block_indexes].tolist()
+         text_block = [text.replace("\n", "").strip() for text in text_block]
+         if id2label[category_block] == "Text" or id2label[category_block] == "Caption" or id2label[category_block] == "Footnote":
+             text_block = ' '.join(text_block)
+         else:
+             text_block = '\n'.join(text_block)
+         text_block_list.append(text_block)
+
+     return bboxes_block_list, category_block_list, text_block_list
+
+ # function to sort bounding boxes
+ def get_sorted_boxes(bboxes):
+
+     # sort by y from page top to bottom
+     sorted_bboxes = sorted(bboxes, key=itemgetter(1), reverse=False)
+     y_list = [bbox[1] for bbox in sorted_bboxes]
+
+     # sort by x from page left to right when boxes share the same y
+     if len(list(set(y_list))) != len(y_list):
+         y_list_duplicates_indexes = dict()
+         y_list_duplicates = [item for item, count in collections.Counter(y_list).items() if count > 1]
+         for item in y_list_duplicates:
+             y_list_duplicates_indexes[item] = [i for i, e in enumerate(y_list) if e == item]
+             bbox_list_y_duplicates = sorted(np.array(sorted_bboxes, dtype=object)[y_list_duplicates_indexes[item]].tolist(), key=itemgetter(0), reverse=False)
+             np_array_bboxes = np.array(sorted_bboxes)
+             np_array_bboxes[y_list_duplicates_indexes[item]] = np.array(bbox_list_y_duplicates)
+             sorted_bboxes = np_array_bboxes.tolist()
+
+     return sorted_bboxes
+
+ # sort data from y = 0 to the end of the page (and then, x = 0 to the end of the page when necessary)
+ def sort_data(bboxes, categories, texts):
+
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_categories = np.array(categories, dtype=object)[sorted_bboxes_indexes].tolist()
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+
+     return sorted_bboxes, sorted_categories, sorted_texts
+
+ # sort data from y = 0 to the end of the page (and then, x = 0 to the end of the page when necessary)
+ def sort_data_wo_labels(bboxes, texts):
+
+     sorted_bboxes = get_sorted_boxes(bboxes)
+     sorted_bboxes_indexes = [bboxes.index(bbox) for bbox in sorted_bboxes]
+     sorted_texts = np.array(texts, dtype=object)[sorted_bboxes_indexes].tolist()
+
+     return sorted_bboxes, sorted_texts
+
+ ## PDF processing
+
+ # get filename and images of PDF pages
+ def pdf_to_images(uploaded_pdf):
+
+     # check if None object
+     if uploaded_pdf is None:
+         path_to_file = pdf_blank
+         filename = path_to_file.replace(examples_dir, "")
+         msg = "Invalid PDF file."
+         images = [Image.open(image_blank)]
+     else:
+         # path to the uploaded PDF
+         path_to_file = uploaded_pdf.name
+         filename = path_to_file.replace("/tmp/", "")
+
+         try:
+             PdfReader(path_to_file)
+         except PdfReadError:
+             path_to_file = pdf_blank
+             filename = path_to_file.replace(examples_dir, "")
+             msg = "Invalid PDF file."
+             images = [Image.open(image_blank)]
+         else:
+             try:
+                 images = convert_from_path(path_to_file, last_page=max_imgboxes)
+                 num_imgs = len(images)
+                 msg = f'The PDF "{filename}" was converted into {num_imgs} images.'
+             except:
+                 msg = f'Error with the PDF "{filename}": it was not converted into images.'
+                 images = [Image.open(image_wo_content)]
+
+     return filename, msg, images
+
+ # extraction of image data (texts and bounding boxes)
+ def extraction_data_from_image(images):
+
+     num_imgs = len(images)
+
+     if num_imgs > 0:
+
+         # https://pyimagesearch.com/2021/11/15/tesseract-page-segmentation-modes-psms-explained-how-to-improve-your-ocr-accuracy/
+         custom_config = r'--oem 3 --psm 3 -l eng'  # default config PyTesseract: --oem 3 --psm 3 -l eng+deu+fra+jpn+por+spa+rus+hin+chi_sim
+         results, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes, images_pixels = dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict(), dict()
+         images_ids_list, texts_lines_list, texts_pars_list, texts_lines_par_list, par_boxes_list, line_boxes_list, lines_par_boxes_list, images_list, images_pixels_list, page_no_list, num_pages_list = list(), list(), list(), list(), list(), list(), list(), list(), list(), list(), list()
+
+         try:
+             for i, image in enumerate(images):
+                 # image preprocessing
+                 # https://docs.opencv.org/3.0-beta/doc/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html
+                 img = image.copy()
+                 factor, path_to_img = set_image_dpi_resize(img)  # rescale to 300dpi
+                 img = Image.open(path_to_img)
+                 img = np.array(img, dtype='uint8')  # convert PIL to cv2
+                 img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # grayscale image
+                 ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
+
+                 # OCR PyTesseract | get langs of page
+                 txt = pytesseract.image_to_string(img, config=custom_config)
+                 txt = txt.strip().lower()
+                 txt = re.sub(r" +", " ", txt)  # multiple spaces
+                 txt = re.sub(r"(\n\s*)+\n+", "\n", txt)  # multiple line breaks
+                 # txt = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+                 try:
+                     langs = detect_langs(txt)
+                     langs = [langdetect2Tesseract[langs[i].lang] for i in range(len(langs))]
+                     langs_string = '+'.join(langs)
+                 except:
+                     langs_string = "eng"
+                 langs_string += '+osd'
+                 custom_config = f'--oem 3 --psm 3 -l {langs_string}'  # default config PyTesseract: --oem 3 --psm 3
+
+                 # OCR PyTesseract | get data
+                 results[i] = pytesseract.image_to_data(img, config=custom_config, output_type=pytesseract.Output.DICT)
+                 # results[i] = os.popen(f'tesseract {img_filepath} - {custom_config}').read()
+
+                 # get image pixels
+                 images_pixels[i] = feature_extractor(images[i], return_tensors="pt").pixel_values
+
+                 texts_lines[i], texts_pars[i], texts_lines_par[i], row_indexes[i], par_boxes[i], line_boxes[i], lines_par_boxes[i] = get_data_paragraph(results[i], factor, conf_min=0)
+                 texts_lines_list.append(texts_lines[i])
+                 texts_pars_list.append(texts_pars[i])
+                 texts_lines_par_list.append(texts_lines_par[i])
+                 par_boxes_list.append(par_boxes[i])
+                 line_boxes_list.append(line_boxes[i])
+                 lines_par_boxes_list.append(lines_par_boxes[i])
+                 images_ids_list.append(i)
+                 images_pixels_list.append(images_pixels[i])
+                 images_list.append(images[i])
+                 page_no_list.append(i)
+                 num_pages_list.append(num_imgs)
+
+         except:
+             print("There was an error during the extraction of PDF text by the OCR!")
+         else:
+             from datasets import Dataset
+             dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "images_pixels": images_pixels_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts_line": texts_lines_list, "texts_par": texts_pars_list, "texts_lines_par": texts_lines_par_list, "bboxes_par": par_boxes_list, "bboxes_lines_par": lines_par_boxes_list})
+
+             # print("The text data was successfully extracted by the OCR!")
+
+         return dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes
+
+ ## Inference
+
+ def prepare_inference_features_paragraph(example, cls_box=cls_box, sep_box=sep_box):
+
+     images_ids_list, chunks_ids_list, input_ids_list, attention_mask_list, bb_list, images_pixels_list = list(), list(), list(), list(), list(), list()
+
+     # get batch
+     # batch_page_hash = example["page_hash"]
+     batch_images_ids = example["images_ids"]
+     batch_images = example["images"]
+     batch_images_pixels = example["images_pixels"]
+     batch_bboxes_par = example["bboxes_par"]
+     batch_texts_par = example["texts_par"]
+     batch_images_size = [image.size for image in batch_images]
+
+     batch_width, batch_height = [image_size[0] for image_size in batch_images_size], [image_size[1] for image_size in batch_images_size]
+
+     # add a dimension if not a batch but only one image
+     if not isinstance(batch_images_ids, list):
+         batch_images_ids = [batch_images_ids]
+         batch_images = [batch_images]
+         batch_images_pixels = [batch_images_pixels]
+         batch_bboxes_par = [batch_bboxes_par]
+         batch_texts_par = [batch_texts_par]
+         batch_width, batch_height = [batch_width], [batch_height]
+
+     # process all images of the batch
+     for num_batch, (image_id, image_pixels, boxes, texts_par, width, height) in enumerate(zip(batch_images_ids, batch_images_pixels, batch_bboxes_par, batch_texts_par, batch_width, batch_height)):
+         tokens_list = []
+         bboxes_list = []
+
+         # add a dimension if only one image
+         if not isinstance(texts_par, list):
+             texts_par, boxes = [texts_par], [boxes]
+
+         # normalize the boxes to the model's 1000x1000 coordinate space
+         normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
+
+         # sort boxes with texts
+         # we want sorted lists from top to bottom of the image
+         boxes, texts_par = sort_data_wo_labels(normalize_bboxes_par, texts_par)
+
+         count = 0
+         for box, text_par in zip(boxes, texts_par):
+             tokens_par = tokenizer.tokenize(text_par)
+             num_tokens_par = len(tokens_par)  # get number of tokens
+             tokens_list.extend(tokens_par)
+             bboxes_list.extend([box] * num_tokens_par)  # the number of boxes must be the same as the number of tokens
+
+         # use of return_overflowing_tokens=True / stride=doc_stride
+         # to get parts of the page with overlap
+         # source: https://huggingface.co/course/chapter6/3b?fw=tf#handling-long-contexts
+         encodings = tokenizer(" ".join(texts_par),
+                               truncation=True,
+                               padding="max_length",
+                               max_length=max_length,
+                               stride=doc_stride,
+                               return_overflowing_tokens=True,
+                               return_offsets_mapping=True
+                               )
+
+         otsm = encodings.pop("overflow_to_sample_mapping")
+         offset_mapping = encodings.pop("offset_mapping")
+
+         # let's get the boxes of those chunks
+         sequence_length_prev = 0
+         for i, offsets in enumerate(offset_mapping):
+             # truncate tokens and boxes based on length of chunk - 2 (special tokens <s> and </s>)
+             sequence_length = len(encodings.input_ids[i]) - 2
+             if i == 0: start = 0
+             else: start += sequence_length_prev - doc_stride
+             end = start + sequence_length
+             sequence_length_prev = sequence_length
+
+             # get the boxes of this chunk
+             bb = [cls_box] + bboxes_list[start:end] + [sep_box]
+
+             # as the last chunk can have a length < max_length,
+             # we must pad it with [sep_box] (boxes) to match the padded tokens
+             if len(bb) < max_length:
+                 bb = bb + [sep_box] * (max_length - len(bb))
+
+             # append results
+             input_ids_list.append(encodings["input_ids"][i])
+             attention_mask_list.append(encodings["attention_mask"][i])
+             bb_list.append(bb)
+             images_ids_list.append(image_id)
+             chunks_ids_list.append(i)
+             images_pixels_list.append(image_pixels)
+
+     return {
+         "images_ids": images_ids_list,
+         "chunk_ids": chunks_ids_list,
+         "input_ids": input_ids_list,
+         "attention_mask": attention_mask_list,
+         "normalized_bboxes": bb_list,
+         "images_pixels": images_pixels_list
+     }
+
+ from torch.utils.data import Dataset
+
+ class CustomDataset(Dataset):
+     def __init__(self, dataset, tokenizer):
+         self.dataset = dataset
+         self.tokenizer = tokenizer
+
+     def __len__(self):
+         return len(self.dataset)
+
+     def __getitem__(self, idx):
+         # get item
+         example = self.dataset[idx]
+         encoding = dict()
+         encoding["images_ids"] = example["images_ids"]
+         encoding["chunk_ids"] = example["chunk_ids"]
+         encoding["input_ids"] = example["input_ids"]
+         encoding["attention_mask"] = example["attention_mask"]
+         encoding["bbox"] = example["normalized_bboxes"]
+         encoding["images_pixels"] = example["images_pixels"]
+
+         return encoding
+
+ import torch.nn.functional as F
+
+ # get predictions at token level
+ def predictions_token_level(images, custom_encoded_dataset):
+
+     num_imgs = len(images)
+     if num_imgs > 0:
+
+         chunk_ids, input_ids, bboxes, pixels_values, outputs, token_predictions = dict(), dict(), dict(), dict(), dict(), dict()
+         images_ids_list = list()
+
+         for i, encoding in enumerate(custom_encoded_dataset):
+
+             # get custom encoded data
+             image_id = encoding['images_ids']
+             chunk_id = encoding['chunk_ids']
+             input_id = torch.tensor(encoding['input_ids'])[None]
+             attention_mask = torch.tensor(encoding['attention_mask'])[None]
+             bbox = torch.tensor(encoding['bbox'])[None]
+             pixel_values = torch.tensor(encoding["images_pixels"])
+
+             # save data in dictionaries
+             if image_id not in images_ids_list: images_ids_list.append(image_id)
+
+             if image_id in chunk_ids: chunk_ids[image_id].append(chunk_id)
+             else: chunk_ids[image_id] = [chunk_id]
+
+             if image_id in input_ids: input_ids[image_id].append(input_id)
+             else: input_ids[image_id] = [input_id]
+
+             if image_id in bboxes: bboxes[image_id].append(bbox)
+             else: bboxes[image_id] = [bbox]
+
+             if image_id in pixels_values: pixels_values[image_id].append(pixel_values)
+             else: pixels_values[image_id] = [pixel_values]
+
+             # get prediction with forward pass
+             with torch.no_grad():
+                 output = model(
+                     input_ids=input_id.to(device),
+                     attention_mask=attention_mask.to(device),
+                     bbox=bbox.to(device),
+                     image=pixel_values.to(device)
+                 )
+
+             # save probabilities of predictions in dictionary
+             if image_id in outputs: outputs[image_id].append(F.softmax(output.logits.squeeze(), dim=-1))
+             else: outputs[image_id] = [F.softmax(output.logits.squeeze(), dim=-1)]
+
+         return outputs, images_ids_list, chunk_ids, input_ids, bboxes
+
+     else:
+         print("An error occurred while getting predictions!")
+
+ from functools import reduce
+
+ # get predictions (paragraph level)
+ def predictions_paragraph_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+
+     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
+     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
+
+     if len(images_ids_list) > 0:
+
+         for i, image_id in enumerate(images_ids_list):
+
+             # get image information
+             images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+             image = images_list[0]
+             width, height = image.size
+
+             # get data
+             chunk_ids_list = chunk_ids[image_id]
+             outputs_list = outputs[image_id]
+             input_ids_list = input_ids[image_id]
+             bboxes_list = bboxes[image_id]
+
+             # create zeros tensors
+             ten_probs = torch.zeros((outputs_list[0].shape[0] - 2)*len(outputs_list), outputs_list[0].shape[1])
+             ten_input_ids = torch.ones(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list)), dtype=int)
+             ten_bboxes = torch.zeros(size=(1, (outputs_list[0].shape[0] - 2)*len(outputs_list), 4), dtype=int)
+
+             if len(outputs_list) > 1:
+
+                 # merge the chunks of the page, averaging the probabilities on the overlap
+                 for num_output, (output, input_id, bbox) in enumerate(zip(outputs_list, input_ids_list, bboxes_list)):
+                     start = num_output*(max_length - 2) - max(0, num_output)*doc_stride
+                     end = start + (max_length - 2)
+
+                     if num_output == 0:
+                         ten_probs[start:end, :] += output[1:-1]
+                         ten_input_ids[:, start:end] = input_id[:, 1:-1]
+                         ten_bboxes[:, start:end, :] = bbox[:, 1:-1, :]
+                     else:
+                         ten_probs[start:start + doc_stride, :] += output[1:1 + doc_stride]
+                         ten_probs[start:start + doc_stride, :] = ten_probs[start:start + doc_stride, :] * 0.5
+                         ten_probs[start + doc_stride:end, :] += output[1 + doc_stride:-1]
+
+                         ten_input_ids[:, start:start + doc_stride] = input_id[:, 1:1 + doc_stride]
+                         ten_input_ids[:, start + doc_stride:end] = input_id[:, 1 + doc_stride:-1]
+
+                         ten_bboxes[:, start:start + doc_stride, :] = bbox[:, 1:1 + doc_stride, :]
+                         ten_bboxes[:, start + doc_stride:end, :] = bbox[:, 1 + doc_stride:-1, :]
+
+             else:
+                 ten_probs += outputs_list[0][1:-1]
+                 ten_input_ids = input_ids_list[0][:, 1:-1]
+                 ten_bboxes = bboxes_list[0][:, 1:-1]
+
+             # group token probabilities by (denormalized) bounding box
+             ten_probs_list, ten_input_ids_list, ten_bboxes_list = ten_probs.tolist(), ten_input_ids.tolist()[0], ten_bboxes.tolist()[0]
+             bboxes_list = list()
+             input_ids_dict, probs_dict = dict(), dict()
+             bbox_prev = [-100, -100, -100, -100]
+             for probs, input_id, bbox in zip(ten_probs_list, ten_input_ids_list, ten_bboxes_list):
+                 bbox = denormalize_box(bbox, width, height)
+                 if bbox != bbox_prev and bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
+                     bboxes_list.append(bbox)
+                     input_ids_dict[str(bbox)] = [input_id]
+                     probs_dict[str(bbox)] = [probs]
+                 elif bbox != cls_box and bbox != sep_box and bbox[0] != bbox[2] and bbox[1] != bbox[3]:
+                     input_ids_dict[str(bbox)].append(input_id)
+                     probs_dict[str(bbox)].append(probs)
+                 bbox_prev = bbox
+
+             # assign to each box the label that maximizes the geometric mean of its token probabilities
+             probs_bbox = dict()
+             for i, bbox in enumerate(bboxes_list):
+                 probs = probs_dict[str(bbox)]
+                 probs = np.array(probs).T.tolist()
+
+                 probs_label = list()
+                 for probs_list in probs:
+                     prob_label = reduce(lambda x, y: x*y, probs_list)
+                     prob_label = prob_label**(1./(len(probs_list)))  # normalization (geometric mean)
+                     probs_label.append(prob_label)
+                 max_value = max(probs_label)
+                 max_index = probs_label.index(max_value)
+                 probs_bbox[str(bbox)] = max_index
+
+             bboxes_list_dict[image_id] = bboxes_list
+             input_ids_dict_dict[image_id] = input_ids_dict
+             probs_dict_dict[image_id] = probs_bbox
+
+             df[image_id] = pd.DataFrame()
+             df[image_id]["bboxes"] = bboxes_list
+             df[image_id]["texts"] = [tokenizer.decode(input_ids_dict[str(bbox)]) for bbox in bboxes_list]
+             df[image_id]["labels"] = [id2label[probs_bbox[str(bbox)]] for bbox in bboxes_list]
+
+         return probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df
+
+     else:
+         print("An error occurred while getting predictions!")
+
+ # get labeled images with paragraph bounding boxes
+ def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
+
+     labeled_images = list()
+
+     for i, image_id in enumerate(images_ids_list):
+
+         # get image
+         images_list = dataset.filter(lambda example: example["images_ids"] == image_id)["images"]
+         image = images_list[0]
+         width, height = image.size
+
+         # get predicted boxes and labels
+         bboxes_list = bboxes_list_dict[image_id]
+         probs_bbox = probs_dict_dict[image_id]
+
+         draw = ImageDraw.Draw(image)
+         # https://stackoverflow.com/questions/66274858/choosing-a-pil-imagefont-by-font-name-rather-than-filename-and-cross-platform-f
+         font = font_manager.FontProperties(family='sans-serif', weight='bold')
+         font_file = font_manager.findfont(font)
+         font_size = 30
+         font = ImageFont.truetype(font_file, font_size)
+
+         for bbox in bboxes_list:
+             predicted_label = id2label[probs_bbox[str(bbox)]]
+             draw.rectangle(bbox, outline=label2color[predicted_label])
+             draw.text((bbox[0] + 10, bbox[1] - font_size), text=predicted_label, fill=label2color[predicted_label], font=font)
+
+         labeled_images.append(image)
+
+     return labeled_images
+
+ # get data of an encoded chunk
+ def get_encoded_chunk_inference(index_chunk=None):
+
+     # get datasets
+     example = dataset
+     encoded_example = encoded_dataset
+
+     # get a random chunk of the dataset
+     if index_chunk == None: index_chunk = random.randint(0, len(encoded_example)-1)
+     encoded_example = encoded_example[index_chunk]
+     encoded_image_ids = encoded_example["images_ids"]
+
+     # get the image
+     example = example.filter(lambda example: example["images_ids"] == encoded_image_ids)[0]
+     image = example["images"]  # original image
+     width, height = image.size
+     page_no = example["page_no"]
+     num_pages = example["num_pages"]
+
+     # get boxes, texts, categories
+     bboxes, input_ids = encoded_example["normalized_bboxes"][1:-1], encoded_example["input_ids"][1:-1]
+     bboxes = [denormalize_box(bbox, width, height) for bbox in bboxes]
+     num_tokens = len(input_ids) + 2
+
+     # get unique bboxes and corresponding labels
+     bboxes_list, input_ids_list = list(), list()
+     input_ids_dict = dict()
+     bbox_prev = [-100, -100, -100, -100]
+     for i, (bbox, input_id) in enumerate(zip(bboxes, input_ids)):
+         if bbox != bbox_prev:
+             bboxes_list.append(bbox)
+             input_ids_dict[str(bbox)] = [input_id]
+         else:
+             input_ids_dict[str(bbox)].append(input_id)
+
+         # start_indexes_list.append(i)
+         bbox_prev = bbox
+
+     # do not keep "</s><pad><pad>..."
+     if input_ids_dict[str(bboxes_list[-1])][0] == (tokenizer.convert_tokens_to_ids('</s>')):
+         del input_ids_dict[str(bboxes_list[-1])]
+         bboxes_list = bboxes_list[:-1]
+
+     # get texts by line
+     input_ids_list = input_ids_dict.values()
+     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
+
+     # display DataFrame
+     df = pd.DataFrame({"texts": texts_list, "input_ids": input_ids_list, "bboxes": bboxes_list})
+
+     return image, df, num_tokens, page_no, num_pages
+
+ # display chunk of PDF image and its data
+ def display_chunk_paragraphs_inference(index_chunk=None):
+
+     # get image and image data
+     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
+
+     # get data from dataframe
+     input_ids = df["input_ids"]
+     texts = df["texts"]
+     bboxes = df["bboxes"]
+
+     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
+
+     # display image with bounding boxes
+     print(">> PDF image with bounding boxes of paragraphs\n")
+     draw = ImageDraw.Draw(image)
+
+     labels = list()
+     for box, text in zip(bboxes, texts):
+         color = "red"
+         draw.rectangle(box, outline=color)
+
+     # resize image to half its size for display
+     width, height = image.size
+     image = image.resize((int(0.5*width), int(0.5*height)))
+
+     # convert to cv2 and display
+     img = np.array(image, dtype='uint8')  # PIL to cv2
+     cv2_imshow(img)  # cv2_imshow / display below are available in Colab / IPython notebooks
+     cv2.waitKey(0)
+
+     # display image dataframe
+     print("\n>> Dataframe of annotated paragraphs\n")
+     cols = ["texts", "bboxes"]
+     df = df[cols]
+     display(df)
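
One step in `predictions_paragraph_level` worth spelling out: each paragraph box collects one probability vector per token, and the label assigned to the box is the one that maximizes the geometric mean of the token probabilities across the box. A minimal numeric sketch of that aggregation (the probability values are made up for illustration):

```python
# minimal sketch of the per-box label aggregation in predictions_paragraph_level
from functools import reduce

import numpy as np

# hypothetical per-token probabilities for one paragraph box: 3 tokens x 2 labels
probs = [[0.9, 0.1],
         [0.8, 0.2],
         [0.6, 0.4]]

probs_label = []
for probs_list in np.array(probs).T.tolist():            # iterate per label
    prob_label = reduce(lambda x, y: x * y, probs_list)   # product of token probs
    prob_label = prob_label ** (1.0 / len(probs_list))    # geometric mean
    probs_label.append(prob_label)

max_index = probs_label.index(max(probs_label))  # label index assigned to the box
print(probs_label, "->", max_index)              # [~0.756, 0.2] -> 0
```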
files/languages_iso.csv ADDED
@@ -0,0 +1,184 @@
+ Language,LangCode
+ Abkhazian,ab
+ Afar,aa
+ Afrikaans,af
+ Akan,ak
+ Albanian,sq
+ Amharic,am
+ Arabic,ar
+ Aragonese,an
+ Armenian,hy
+ Assamese,as
+ Avaric,av
+ Avestan,ae
+ Aymara,ay
+ Azerbaijani,az
+ Bambara,bm
+ Bashkir,ba
+ Basque,eu
+ Belarusian,be
+ Bengali,bn
+ Bislama,bi
+ Bosnian,bs
+ Breton,br
+ Bulgarian,bg
+ Burmese,my
+ "Catalan, Valencian",ca
+ Chamorro,ch
+ Chechen,ce
+ "Chichewa, Chewa, Nyanja",ny
+ Chinese,zh
+ "Church Slavonic, Old Slavonic, Old Church Slavonic",cu
+ Chuvash,cv
+ Cornish,kw
+ Corsican,co
+ Cree,cr
+ Croatian,hr
+ Czech,cs
+ Danish,da
+ "Divehi, Dhivehi, Maldivian",dv
+ "Dutch, Flemish",nl
+ Dzongkha,dz
+ English,en
+ Esperanto,eo
+ Estonian,et
+ Ewe,ee
+ Faroese,fo
+ Fijian,fj
+ Finnish,fi
+ French,fr
+ Western Frisian,fy
+ Fulah,ff
+ "Gaelic, Scottish Gaelic",gd
+ Galician,gl
+ Ganda,lg
+ Georgian,ka
+ German,de
+ "Greek, Modern (1453–)",el
+ "Kalaallisut, Greenlandic",kl
+ Guarani,gn
+ Gujarati,gu
+ "Haitian, Haitian Creole",ht
+ Hausa,ha
+ Hebrew,he
+ Herero,hz
+ Hindi,hi
+ Hiri Motu,ho
+ Hungarian,hu
+ Icelandic,is
+ Ido,io
+ Igbo,ig
+ Indonesian,id
+ Interlingua (International Auxiliary Language Association),ia
+ "Interlingue, Occidental",ie
+ Inuktitut,iu
+ Inupiaq,ik
+ Irish,ga
+ Italian,it
+ Japanese,ja
+ Javanese,jv
+ Kannada,kn
+ Kanuri,kr
+ Kashmiri,ks
+ Kazakh,kk
+ Central Khmer,km
+ "Kikuyu, Gikuyu",ki
+ Kinyarwanda,rw
+ "Kirghiz, Kyrgyz",ky
+ Komi,kv
+ Kongo,kg
+ Korean,ko
+ "Kuanyama, Kwanyama",kj
+ Kurdish,ku
+ Lao,lo
+ Latin,la
+ Latvian,lv
+ "Limburgan, Limburger, Limburgish",li
+ Lingala,ln
+ Lithuanian,lt
+ Luba-Katanga,lu
+ "Luxembourgish, Letzeburgesch",lb
+ Macedonian,mk
+ Malagasy,mg
+ Malay,ms
+ Malayalam,ml
+ Maltese,mt
+ Manx,gv
+ Maori,mi
+ Marathi,mr
+ Marshallese,mh
+ Mongolian,mn
+ Nauru,na
+ "Navajo, Navaho",nv
+ North Ndebele,nd
+ South Ndebele,nr
+ Ndonga,ng
+ Nepali,ne
+ Norwegian,no
+ Norwegian Bokmål,nb
+ Norwegian Nynorsk,nn
+ "Sichuan Yi, Nuosu",ii
+ Occitan,oc
+ Ojibwa,oj
+ Oriya,or
+ Oromo,om
+ "Ossetian, Ossetic",os
+ Pali,pi
+ "Pashto, Pushto",ps
+ Persian,fa
+ Polish,pl
+ Portuguese,pt
+ "Punjabi, Panjabi",pa
+ Quechua,qu
+ "Romanian, Moldavian, Moldovan",ro
+ Romansh,rm
+ Rundi,rn
+ Russian,ru
+ Northern Sami,se
+ Samoan,sm
+ Sango,sg
+ Sanskrit,sa
+ Sardinian,sc
+ Serbian,sr
+ Shona,sn
+ Sindhi,sd
+ "Sinhala, Sinhalese",si
+ Slovak,sk
+ Slovenian,sl
+ Somali,so
+ Southern Sotho,st
+ "Spanish, Castilian",es
+ Sundanese,su
+ Swahili,sw
+ Swati,ss
+ Swedish,sv
+ Tagalog,tl
+ Tahitian,ty
+ Tajik,tg
+ Tamil,ta
+ Tatar,tt
+ Telugu,te
+ Thai,th
+ Tibetan,bo
+ Tigrinya,ti
+ Tonga (Tonga Islands),to
+ Tsonga,ts
+ Tswana,tn
+ Turkish,tr
+ Turkmen,tk
+ Twi,tw
+ "Uighur, Uyghur",ug
+ Ukrainian,uk
+ Urdu,ur
+ Uzbek,uz
+ Venda,ve
+ Vietnamese,vi
+ Volapük,vo
+ Walloon,wa
+ Welsh,cy
+ Wolof,wo
+ Xhosa,xh
+ Yiddish,yi
+ Yoruba,yo
+ "Zhuang, Chuang",za
+ Zulu,zu
files/languages_tesseract.csv ADDED
@@ -0,0 +1,127 @@
+ Language,LangCode
+ Afrikaans,afr
+ Amharic,amh
+ Arabic,ara
+ Assamese,asm
+ Azerbaijani,aze
+ Azerbaijani - Cyrilic,aze_cyrl
+ Belarusian,bel
+ Bengali,ben
+ Tibetan,bod
+ Bosnian,bos
+ Breton,bre
+ Bulgarian,bul
+ Catalan; Valencian,cat
+ Cebuano,ceb
+ Czech,ces
+ Chinese - Simplified,chi_sim
+ Chinese - Traditional,chi_tra
+ Cherokee,chr
+ Corsican,cos
+ Welsh,cym
+ Danish,dan
+ Danish - Fraktur (contrib),dan_frak
+ German,deu
+ German - Fraktur (contrib),deu_frak
+ Dzongkha,dzo
+ "Greek, Modern (1453-)",ell
+ English,eng
+ "English, Middle (1100-1500)",enm
+ Esperanto,epo
+ Math / equation detection module,equ
+ Estonian,est
+ Basque,eus
+ Faroese,fao
+ Persian,fas
+ Filipino (old - Tagalog),fil
+ Finnish,fin
+ French,fra
+ German - Fraktur,frk
+ "French, Middle (ca.1400-1600)",frm
+ Western Frisian,fry
+ Scottish Gaelic,gla
+ Irish,gle
+ Galician,glg
+ "Greek, Ancient (to 1453) (contrib)",grc
+ Gujarati,guj
+ Haitian; Haitian Creole,hat
+ Hebrew,heb
+ Hindi,hin
+ Croatian,hrv
+ Hungarian,hun
+ Armenian,hye
+ Inuktitut,iku
+ Indonesian,ind
+ Icelandic,isl
+ Italian,ita
+ Italian - Old,ita_old
+ Javanese,jav
+ Japanese,jpn
+ Kannada,kan
+ Georgian,kat
+ Georgian - Old,kat_old
+ Kazakh,kaz
+ Central Khmer,khm
+ Kirghiz; Kyrgyz,kir
+ Kurmanji (Kurdish - Latin Script),kmr
+ Korean,kor
+ Korean (vertical),kor_vert
+ Kurdish (Arabic Script),kur
+ Lao,lao
+ Latin,lat
+ Latvian,lav
+ Lithuanian,lit
+ Luxembourgish,ltz
+ Malayalam,mal
+ Marathi,mar
+ Macedonian,mkd
+ Maltese,mlt
+ Mongolian,mon
+ Maori,mri
+ Malay,msa
+ Burmese,mya
+ Nepali,nep
+ Dutch; Flemish,nld
+ Norwegian,nor
+ Occitan (post 1500),oci
+ Oriya,ori
+ Orientation and script detection module,osd
+ Panjabi; Punjabi,pan
+ Polish,pol
+ Portuguese,por
+ Pushto; Pashto,pus
+ Quechua,que
+ Romanian; Moldavian; Moldovan,ron
+ Russian,rus
+ Sanskrit,san
+ Sinhala; Sinhalese,sin
+ Slovak,slk
+ Slovak - Fraktur (contrib),slk_frak
+ Slovenian,slv
+ Sindhi,snd
+ Spanish; Castilian,spa
+ Spanish; Castilian - Old,spa_old
+ Albanian,sqi
+ Serbian,srp
+ Serbian - Latin,srp_latn
+ Sundanese,sun
+ Swahili,swa
+ Swedish,swe
+ Syriac,syr
+ Tamil,tam
+ Tatar,tat
+ Telugu,tel
+ Tajik,tgk
+ Tagalog (new - Filipino),tgl
+ Thai,tha
+ Tigrinya,tir
+ Tonga,ton
+ Turkish,tur
+ Uighur; Uyghur,uig
+ Ukrainian,ukr
+ Urdu,urd
+ Uzbek,uzb
+ Uzbek - Cyrilic,uzb_cyrl
+ Vietnamese,vie
+ Yiddish,yid
+ Yoruba,yor
files/template.pdf ADDED
Binary file (29.4 kB)
files/wo_content.png ADDED
packages.txt ADDED
@@ -0,0 +1,2 @@
+ tesseract-ocr-all
+ poppler-utils
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ torch
+ transformers
+ datasets
+ pytesseract
+ opencv-python
+ pdf2image
+ pypdf
+ langdetect
+ gradio