chaaim123 committed on
Commit
88f69f2
·
verified ·
1 Parent(s): 5fc613b

Create data/pdf_reader.py

Browse files
Files changed (1) hide show
  1. data/pdf_reader.py +31 -0
data/pdf_reader.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pdf_reader.py
2
+ import PyPDF2
3
+ from typing import List
4
+
5
class PDFReader:
    """Extract the text of a PDF file page by page using PyPDF2.

    Attributes:
        page_list: Stripped text of every non-blank page from the most
            recent successful ``read_pdf`` call. Empty until a read succeeds.
    """

    def __init__(self):
        self.page_list = []

    @staticmethod
    def _extract_page_texts(pages) -> List[str]:
        """Return the stripped text of each page, skipping blank pages.

        Args:
            pages: Iterable of objects exposing ``extract_text()``.

        Returns:
            One string per page that still has content after stripping.
        """
        texts = []
        for page in pages:
            # Strip *before* testing for emptiness so that pages holding
            # only whitespace are dropped rather than stored as "".
            text = (page.extract_text() or "").strip()
            if text:
                texts.append(text)
        return texts

    def read_pdf(self, file_path: str) -> List[str]:
        """Read a PDF and return the text of its non-blank pages.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            A list in page order; each element is the stripped text of one
            page. Pages without text are omitted. The same list is stored
            on ``self.page_list``.

        Raises:
            Exception: If the file cannot be opened or parsed. The original
                error is attached as ``__cause__``. ``self.page_list`` is
                left unchanged in that case.
        """
        try:
            # The file must stay open while text is extracted from the pages.
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                pages = self._extract_page_texts(pdf_reader.pages)
        except Exception as e:
            raise Exception(f"Error reading PDF: {str(e)}") from e

        # Publish only after the whole document was read successfully, so a
        # failure never leaves a half-filled page list behind.
        self.page_list = pages
        return self.page_list