Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -9,6 +9,11 @@ from langchain_google_genai import ChatGoogleGenerativeAI
|
|
9 |
from langchain.chains.question_answering import load_qa_chain
|
10 |
from langchain.prompts import PromptTemplate
|
11 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
load_dotenv()
|
14 |
os.getenv("GOOGLE_API_KEY")
|
@@ -17,12 +22,27 @@ genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
|
|
17 |
|
18 |
def get_pdf_text(pdf_docs):
|
19 |
text = ""
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
return text
|
27 |
|
28 |
|
|
|
9 |
from langchain.chains.question_answering import load_qa_chain
|
10 |
from langchain.prompts import PromptTemplate
|
11 |
from dotenv import load_dotenv
|
12 |
+
from pdf2image import convert_from_bytes
|
13 |
+
from PIL import Image
|
14 |
+
import pytesseract
|
15 |
+
import io
|
16 |
+
|
17 |
|
18 |
load_dotenv()
|
19 |
os.getenv("GOOGLE_API_KEY")
|
|
|
22 |
|
23 |
def get_pdf_text(pdf_docs):
    """Extract text from uploaded PDF and image files, using OCR when needed.

    For each upload whose name ends in ``.pdf`` (case-insensitive) the
    embedded text layer is read with ``PdfReader``. If that particular file
    yields no text (e.g. a scanned document), its pages are rasterised with
    ``convert_from_bytes`` and passed through ``pytesseract``. Any other
    upload is opened as an image and OCR'd directly.

    Args:
        pdf_docs: Iterable of binary file-like objects exposing ``.name``,
            ``.read()`` and ``.seek()``.
            NOTE(review): presumably Streamlit uploads -- confirm with caller.

    Returns:
        The text of all files concatenated in input order; ``""`` when
        ``pdf_docs`` is empty.
    """
    collected = []

    for uploaded_file in pdf_docs:
        # Compare case-insensitively so "SCAN.PDF" is not mistaken for an image.
        if uploaded_file.name.lower().endswith(".pdf"):
            pdf_reader = PdfReader(uploaded_file)
            # extract_text() may return an empty value for image-only pages.
            file_text = "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )

            # Decide on the OCR fallback per file: text gathered from earlier
            # uploads must not suppress OCR for a later scanned PDF.
            if not file_text.strip():
                # PdfReader has already consumed the stream; rewind so that
                # read() hands the complete document to pdf2image.
                uploaded_file.seek(0)
                images = convert_from_bytes(uploaded_file.read())
                file_text = "".join(
                    pytesseract.image_to_string(img) for img in images
                )

            collected.append(file_text)
        else:
            # Anything that is not a PDF is treated as an image file.
            image = Image.open(uploaded_file)
            collected.append(pytesseract.image_to_string(image))

    return "".join(collected)
|
47 |
|
48 |
|