import re
from io import BytesIO
from typing import List, Tuple

from langchain.docstore.document import Document
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader


def parse_pdf(file: BytesIO, filename: str) -> Tuple[List[str], str]:
    """Extract text from each page of a PDF and normalize its whitespace."""
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Re-join words that were hyphenated across line breaks.
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Replace single newlines with spaces, preserving paragraph breaks.
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Collapse runs of blank lines into a single paragraph break.
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output, filename


def text_to_docs(text: List[str], filename: str) -> List[Document]:
    """Convert per-page texts into chunked Documents with page/chunk metadata."""
    if isinstance(text, str):
        text = [text]
    page_docs = [Document(page_content=page) for page in text]
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    # The splitter is identical for every page, so build it once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=0,
    )
    doc_chunks = []
    for doc in page_docs:
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            # A distinct name avoids shadowing the page-level document.
            chunk_doc = Document(
                page_content=chunk,
                metadata={"page": doc.metadata["page"], "chunk": i},
            )
            chunk_doc.metadata["source"] = f"{doc.metadata['page']}-{i}"
            chunk_doc.metadata["filename"] = filename
            doc_chunks.append(chunk_doc)
    return doc_chunks


def docs_to_index(docs: List[Document], huggingface_model_name: str) -> FAISS:
    """Embed the documents and build a FAISS vector index from them."""
    # Use the model name passed in rather than a hard-coded one.
    embedding_model = HuggingFaceEmbeddings(model_name=huggingface_model_name)
    index = FAISS.from_documents(docs, embedding_model)
    return index


def get_index_for_pdf(
    pdf_files: List[bytes], pdf_names: List[str], huggingface_model_name: str
) -> FAISS:
    """Parse each PDF, chunk it into Documents, and index them all in FAISS."""
    documents = []
    for pdf_file, pdf_name in zip(pdf_files, pdf_names):
        text, filename = parse_pdf(BytesIO(pdf_file), pdf_name)
        documents.extend(text_to_docs(text, filename))
    return docs_to_index(documents, huggingface_model_name)
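

# Example usage: a minimal sketch of how the pipeline fits together.
# The file name "example.pdf" and the query string are illustrative
# placeholders; "all-MiniLM-L6-v2" is the model name this module used above.
if __name__ == "__main__":
    with open("example.pdf", "rb") as f:
        pdf_bytes = f.read()
    index = get_index_for_pdf([pdf_bytes], ["example.pdf"], "all-MiniLM-L6-v2")
    # Retrieve the three chunks most similar to the query.
    for doc in index.similarity_search("What is this document about?", k=3):
        print(doc.metadata["source"], doc.page_content[:80])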