tensorboy0101 committed on
Commit
69ffbc4
·
verified ·
1 Parent(s): 19b5668

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -6
app.py CHANGED
@@ -9,6 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
 
 
 
 
 
12
 
13
  load_dotenv()
14
  os.getenv("GOOGLE_API_KEY")
@@ -17,12 +22,27 @@ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
17
 
18
  def get_pdf_text(pdf_docs):
19
  text = ""
20
- for pdf in pdf_docs:
21
- pdf_reader = PdfReader(pdf)
22
- for page in pdf_reader.pages:
23
- page_text = page.extract_text()
24
- if page_text:
25
- text += page_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  return text
27
 
28
 
 
9
  from langchain.chains.question_answering import load_qa_chain
10
  from langchain.prompts import PromptTemplate
11
  from dotenv import load_dotenv
12
+ from pdf2image import convert_from_bytes
13
+ from PIL import Image
14
+ import pytesseract
15
+ import io
16
+
17
 
18
  load_dotenv()
19
  os.getenv("GOOGLE_API_KEY")
 
22
 
23
  def get_pdf_text(pdf_docs):
24
  text = ""
25
+
26
+ for uploaded_file in pdf_docs:
27
+ if uploaded_file.name.endswith(".pdf"):
28
+ # Process actual PDF files
29
+ pdf_reader = PdfReader(uploaded_file)
30
+ for page in pdf_reader.pages:
31
+ page_text = page.extract_text()
32
+ if page_text:
33
+ text += page_text
34
+
35
+ # If no text extracted, try OCR
36
+ if not text.strip():
37
+ images = convert_from_bytes(uploaded_file.read())
38
+ for img in images:
39
+ text += pytesseract.image_to_string(img)
40
+
41
+ else:
42
+ # Process image files
43
+ image = Image.open(uploaded_file)
44
+ text += pytesseract.image_to_string(image)
45
+
46
  return text
47
 
48