Spaces:
Build error
Build error
Upload 7 files
Browse files
- .gitattributes +3 -0
- app.py +17 -10
- examples/ASICS_Catalog.pdf +3 -0
- examples/M8_Private_Sale_p2.pdf +3 -0
- examples/flexpocket.pdf +3 -0
- main.py +27 -12
- requirements.txt +2 -3
.gitattributes
CHANGED
@@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
36 |
catalogue/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
catalogue/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
catalogue/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
36 |
catalogue/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
catalogue/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
|
38 |
catalogue/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
|
39 |
+
examples/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
|
40 |
+
examples/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
|
41 |
+
examples/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,25 +1,32 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
import shutil
|
4 |
import main
|
5 |
|
|
|
|
|
|
|
|
|
6 |
def predict_from_pdf(pdf_file):
|
7 |
upload_dir = "./catalogue/"
|
8 |
os.makedirs(upload_dir, exist_ok=True)
|
9 |
-
|
10 |
try:
|
|
|
11 |
dest_path = os.path.join(upload_dir, pdf_file.name)
|
12 |
with open(dest_path, "wb") as f:
|
13 |
-
|
14 |
-
|
15 |
-
|
|
|
|
|
16 |
return df, response
|
17 |
except Exception as e:
|
18 |
return None, f"Error: {str(e)}"
|
19 |
|
|
|
20 |
pdf_examples = [
|
21 |
-
["
|
22 |
-
["
|
23 |
]
|
24 |
|
25 |
demo = gr.Interface(
|
@@ -28,9 +35,9 @@ demo = gr.Interface(
|
|
28 |
outputs=["json", "text"],
|
29 |
examples=pdf_examples,
|
30 |
title="Open Source PDF Catalog Parser",
|
31 |
-
description="Efficient PDF catalog processing using
|
32 |
-
article="Uses MinerU for layout analysis and DeepSeek-
|
33 |
)
|
34 |
|
35 |
if __name__ == "__main__":
|
36 |
-
demo.queue().launch(server_name="0.0.0.0", server_port=7860)
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
|
|
3 |
import main
|
4 |
|
5 |
+
#os.environ["CUDA_VISIBLE_DEVICES"]='0'
|
6 |
+
#os.environ["USE_GPU"]="True"
|
7 |
+
|
8 |
+
|
9 |
def predict_from_pdf(pdf_file):
|
10 |
upload_dir = "./catalogue/"
|
11 |
os.makedirs(upload_dir, exist_ok=True)
|
12 |
+
|
13 |
try:
|
14 |
+
# Save the uploaded file to a temporary location
|
15 |
dest_path = os.path.join(upload_dir, pdf_file.name)
|
16 |
with open(dest_path, "wb") as f:
|
17 |
+
with open(pdf_file.name, "rb") as uploaded_file:
|
18 |
+
f.write(uploaded_file.read())
|
19 |
+
|
20 |
+
# Process the PDF
|
21 |
+
df, response = main.process_pdf_catalog(dest_path)
|
22 |
return df, response
|
23 |
except Exception as e:
|
24 |
return None, f"Error: {str(e)}"
|
25 |
|
26 |
+
|
27 |
pdf_examples = [
|
28 |
+
["examples/flexpocket.pdf"],
|
29 |
+
["examples/ASICS_Catalog.pdf"],
|
30 |
]
|
31 |
|
32 |
demo = gr.Interface(
|
|
|
35 |
outputs=["json", "text"],
|
36 |
examples=pdf_examples,
|
37 |
title="Open Source PDF Catalog Parser",
|
38 |
+
description="Efficient PDF catalog processing using PyMuPDF and OpenLLM",
|
39 |
+
article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
|
40 |
)
|
41 |
|
42 |
if __name__ == "__main__":
|
43 |
+
demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
|
examples/ASICS_Catalog.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6263bd2732f674657e22e83deb21d3e34afedc028834e078e28c73c0a305d4b3
|
3 |
+
size 392487
|
examples/M8_Private_Sale_p2.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ebc4730fd7dc87baf2f67c81934d9d3162f662453a7e8d50190b7018058ac1f
|
3 |
+
size 565084
|
examples/flexpocket.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c89cb2cdc331adc6bef8814bf69a02b49acb3d7695da404c80c54910c81b947c
|
3 |
+
size 148538
|
main.py
CHANGED
@@ -36,21 +36,36 @@ class ProductSpec:
|
|
36 |
|
37 |
class PDFProcessor:
|
38 |
def __init__(self):
|
39 |
-
self.emb_model =
|
40 |
-
self.llm = self._initialize_llm()
|
41 |
self.output_dir = Path("./output")
|
42 |
self.output_dir.mkdir(exist_ok=True)
|
43 |
-
|
44 |
-
def
|
|
|
|
|
|
|
|
|
|
|
45 |
"""Initialize LLM with automatic download if needed"""
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def process_pdf(self, pdf_path: str) -> Dict:
|
56 |
"""Process PDF using MinerU pipeline"""
|
|
|
36 |
|
37 |
class PDFProcessor:
|
38 |
def __init__(self):
|
39 |
+
self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
|
40 |
+
self.llm = self._initialize_llm("deepseek-llm-7b-base.Q5_K_M.gguf")
|
41 |
self.output_dir = Path("./output")
|
42 |
self.output_dir.mkdir(exist_ok=True)
|
43 |
+
|
44 |
+
def _initialize_emb_model(self, model_name):
|
45 |
+
model = SentenceTransformer("sentence-transformers/" + model_name)
|
46 |
+
model.save('models/'+ model_name)
|
47 |
+
return model
|
48 |
+
|
49 |
+
def _initialize_llm(self, model_name):
|
50 |
"""Initialize LLM with automatic download if needed"""
|
51 |
+
model_path = os.path.join("models/", model_name)
|
52 |
+
if os.path.exists(model_path):
|
53 |
+
return Llama(
|
54 |
+
model_path=model_path,
|
55 |
+
n_ctx=4096,
|
56 |
+
n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
|
57 |
+
n_threads=os.cpu_count() - 1,
|
58 |
+
verbose=False
|
59 |
+
)
|
60 |
+
else:
|
61 |
+
return Llama.from_pretrained(
|
62 |
+
repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
|
63 |
+
filename=model_name,
|
64 |
+
n_ctx=4096,
|
65 |
+
n_threads=os.cpu_count() - 1,
|
66 |
+
n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
|
67 |
+
verbose=False
|
68 |
+
)
|
69 |
|
70 |
def process_pdf(self, pdf_path: str) -> Dict:
|
71 |
"""Process PDF using MinerU pipeline"""
|
requirements.txt
CHANGED
@@ -5,7 +5,6 @@ llama-cpp-python[huggingface-hub]
|
|
5 |
magic-pdf
|
6 |
paddleocr
|
7 |
paddlepaddle
|
8 |
-
torch
|
9 |
git+https://github.com/opendatalab/MinerU.git
|
10 |
omegaconf
|
11 |
matplotlib
|
@@ -14,8 +13,8 @@ ultralytics
|
|
14 |
doclayout-yolo
|
15 |
unimernet
|
16 |
struct-eqtable
|
17 |
-
lmdeploy
|
18 |
tf-keras
|
19 |
pycocotools
|
20 |
git+https://github.com/facebookresearch/detectron2.git
|
21 |
-
rapid-table
|
|
|
5 |
magic-pdf
|
6 |
paddleocr
|
7 |
paddlepaddle
|
|
|
8 |
git+https://github.com/opendatalab/MinerU.git
|
9 |
omegaconf
|
10 |
matplotlib
|
|
|
13 |
doclayout-yolo
|
14 |
unimernet
|
15 |
struct-eqtable
|
16 |
+
#lmdeploy
|
17 |
tf-keras
|
18 |
pycocotools
|
19 |
git+https://github.com/facebookresearch/detectron2.git
|
20 |
+
rapid-table
|