minar09 committed on
Commit
4bc3210
·
verified ·
1 Parent(s): 0ebd274

Upload 7 files

Browse files
.gitattributes CHANGED
@@ -36,3 +36,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  catalogue/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
37
  catalogue/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
38
  catalogue/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
 
36
  catalogue/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
37
  catalogue/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
38
  catalogue/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
39
+ examples/ASICS_Catalog.pdf filter=lfs diff=lfs merge=lfs -text
40
+ examples/flexpocket.pdf filter=lfs diff=lfs merge=lfs -text
41
+ examples/M8_Private_Sale_p2.pdf filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,25 +1,32 @@
1
  import os
2
  import gradio as gr
3
- import shutil
4
  import main
5
 
 
 
 
 
6
  def predict_from_pdf(pdf_file):
7
  upload_dir = "./catalogue/"
8
  os.makedirs(upload_dir, exist_ok=True)
9
-
10
  try:
 
11
  dest_path = os.path.join(upload_dir, pdf_file.name)
12
  with open(dest_path, "wb") as f:
13
- f.write(pdf_file.read())
14
-
15
- df, response = main_oss.process_pdf_catalog(dest_path)
 
 
16
  return df, response
17
  except Exception as e:
18
  return None, f"Error: {str(e)}"
19
 
 
20
  pdf_examples = [
21
- ["catalogue/flexpocket.pdf"],
22
- ["catalogue/ASICS_Catalog.pdf"],
23
  ]
24
 
25
  demo = gr.Interface(
@@ -28,9 +35,9 @@ demo = gr.Interface(
28
  outputs=["json", "text"],
29
  examples=pdf_examples,
30
  title="Open Source PDF Catalog Parser",
31
- description="Efficient PDF catalog processing using MinerU and OpenLLM",
32
- article="Uses MinerU for layout analysis and DeepSeek-1.3B for structured extraction"
33
  )
34
 
35
  if __name__ == "__main__":
36
- demo.queue().launch(server_name="0.0.0.0", server_port=7860)
 
1
  import os
2
  import gradio as gr
 
3
  import main
4
 
5
+ #os.environ["CUDA_VISIBLE_DEVICES"]='0'
6
+ #os.environ["USE_GPU"]="True"
7
+
8
+
9
  def predict_from_pdf(pdf_file):
10
  upload_dir = "./catalogue/"
11
  os.makedirs(upload_dir, exist_ok=True)
12
+
13
  try:
14
+ # Save the uploaded file to a temporary location
15
  dest_path = os.path.join(upload_dir, pdf_file.name)
16
  with open(dest_path, "wb") as f:
17
+ with open(pdf_file.name, "rb") as uploaded_file:
18
+ f.write(uploaded_file.read())
19
+
20
+ # Process the PDF
21
+ df, response = main.process_pdf_catalog(dest_path)
22
  return df, response
23
  except Exception as e:
24
  return None, f"Error: {str(e)}"
25
 
26
+
27
  pdf_examples = [
28
+ ["examples/flexpocket.pdf"],
29
+ ["examples/ASICS_Catalog.pdf"],
30
  ]
31
 
32
  demo = gr.Interface(
 
35
  outputs=["json", "text"],
36
  examples=pdf_examples,
37
  title="Open Source PDF Catalog Parser",
38
+ description="Efficient PDF catalog processing using PyMuPDF and OpenLLM",
39
+ article="Uses MinerU for layout analysis and DeepSeek-7B for structured extraction"
40
  )
41
 
42
  if __name__ == "__main__":
43
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860, share=True)
examples/ASICS_Catalog.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6263bd2732f674657e22e83deb21d3e34afedc028834e078e28c73c0a305d4b3
3
+ size 392487
examples/M8_Private_Sale_p2.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ebc4730fd7dc87baf2f67c81934d9d3162f662453a7e8d50190b7018058ac1f
3
+ size 565084
examples/flexpocket.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c89cb2cdc331adc6bef8814bf69a02b49acb3d7695da404c80c54910c81b947c
3
+ size 148538
main.py CHANGED
@@ -36,21 +36,36 @@ class ProductSpec:
36
 
37
  class PDFProcessor:
38
  def __init__(self):
39
- self.emb_model = SentenceTransformer('all-MiniLM-L6-v2')
40
- self.llm = self._initialize_llm()
41
  self.output_dir = Path("./output")
42
  self.output_dir.mkdir(exist_ok=True)
43
-
44
- def _initialize_llm(self):
 
 
 
 
 
45
  """Initialize LLM with automatic download if needed"""
46
- return Llama.from_pretrained(
47
- repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
48
- filename="deepseek-llm-7b-base.Q5_K_M.gguf",
49
- n_ctx=2048,
50
- n_threads=os.cpu_count() - 1,
51
- n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
52
- verbose=False
53
- )
 
 
 
 
 
 
 
 
 
 
54
 
55
  def process_pdf(self, pdf_path: str) -> Dict:
56
  """Process PDF using MinerU pipeline"""
 
36
 
37
  class PDFProcessor:
38
  def __init__(self):
39
+ self.emb_model = self._initialize_emb_model("all-MiniLM-L6-v2")
40
+ self.llm = self._initialize_llm("deepseek-llm-7b-base.Q5_K_M.gguf")
41
  self.output_dir = Path("./output")
42
  self.output_dir.mkdir(exist_ok=True)
43
+
44
+ def _initialize_emb_model(self, model_name):
45
+ model = SentenceTransformer("sentence-transformers/" + model_name)
46
+ model.save('models/'+ model_name)
47
+ return model
48
+
49
+ def _initialize_llm(self, model_name):
50
  """Initialize LLM with automatic download if needed"""
51
+ model_path = os.path.join("models/", model_name)
52
+ if os.path.exists(model_path):
53
+ return Llama(
54
+ model_path=model_path,
55
+ n_ctx=4096,
56
+ n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
57
+ n_threads=os.cpu_count() - 1,
58
+ verbose=False
59
+ )
60
+ else:
61
+ return Llama.from_pretrained(
62
+ repo_id="TheBloke/deepseek-llm-7B-base-GGUF",
63
+ filename=model_name,
64
+ n_ctx=4096,
65
+ n_threads=os.cpu_count() - 1,
66
+ n_gpu_layers=35 if os.getenv('USE_GPU') else 0,
67
+ verbose=False
68
+ )
69
 
70
  def process_pdf(self, pdf_path: str) -> Dict:
71
  """Process PDF using MinerU pipeline"""
requirements.txt CHANGED
@@ -5,7 +5,6 @@ llama-cpp-python[huggingface-hub]
5
  magic-pdf
6
  paddleocr
7
  paddlepaddle
8
- torch
9
  git+https://github.com/opendatalab/MinerU.git
10
  omegaconf
11
  matplotlib
@@ -14,8 +13,8 @@ ultralytics
14
  doclayout-yolo
15
  unimernet
16
  struct-eqtable
17
- lmdeploy
18
  tf-keras
19
  pycocotools
20
  git+https://github.com/facebookresearch/detectron2.git
21
- rapid-table
 
5
  magic-pdf
6
  paddleocr
7
  paddlepaddle
 
8
  git+https://github.com/opendatalab/MinerU.git
9
  omegaconf
10
  matplotlib
 
13
  doclayout-yolo
14
  unimernet
15
  struct-eqtable
16
+ #lmdeploy
17
  tf-keras
18
  pycocotools
19
  git+https://github.com/facebookresearch/detectron2.git
20
+ rapid-table