Upload 13 files
- app.py +142 -0
- config/appConfig.py +11 -0
- config/config.py +10 -0
- data/document_loader.py +24 -0
- data/pdf_reader.py +31 -0
- globals.py +6 -0
- requirements.txt +11 -0
- retriever/chat_manager.py +43 -0
- retriever/chunk_documents.py +49 -0
- retriever/document_manager.py +113 -0
- retriever/llm_manager.py +116 -0
- retriever/vector_store_manager.py +90 -0
- utils/document_utils.py +56 -0
app.py
ADDED
@@ -0,0 +1,142 @@
import logging
import gradio as gr
from utils.document_utils import initialize_logging
from retriever.chat_manager import chat_response
# Note: DocumentManager is initialized in config/appConfig.py and shared via globals
from globals import app_config

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
initialize_logging()

def load_sample_question(question):
    return question

def clear_selection():
    return [], "", []  # Reset doc_selector to an empty list

def process_uploaded_file(file, current_selection):
    """Process an uploaded file using DocumentManager and update the UI."""
    status, page_list, filename, _ = app_config.doc_manager.process_document(file.name if file else None)

    # Update the current selection to include the new file if it is not already present
    updated_selection = current_selection if current_selection else []
    if filename and filename not in updated_selection:
        updated_selection.append(filename)

    return (
        status,
        page_list,
        gr.update(choices=app_config.doc_manager.get_uploaded_documents(), value=updated_selection)
    )

def update_doc_selector(selected_docs):
    """Keep selected documents in sync."""
    return selected_docs

# UI Configuration
models = ["gemma2-9b-it", "llama-guard-3-8b", "qwen-2.5-32b"]
example_questions = [
    "What is communication server?",
    "Show me an example of a configuration file.",
    "How to create Protected File Directories?",
    "What are the attributes of the Azureblobstorage port?",
    "What is Mediator help?",
]
all_questions = [
    "Can you explain Communication Server architecture?",
    "Why does the other instance of my multi-instance qmgr seem to hang after a failover? Queue manager will not start after failover.",
    "Explain the concept of blockchain.",
    "What is the capital of France?",
    "Do Surface Porosity and Pore Size Influence Mechanical Properties and Cellular Response to PEEK?",
    "How does a vaccine work?",
    "Tell me the step-by-step instructions for front-door installation.",
    "What are the risk factors for heart disease?",
]

with gr.Blocks() as interface:
    interface.title = "🤖 IntelliDoc: AI Document Explorer"
    gr.Markdown("""
    # 🤖 IntelliDoc: AI Document Explorer
    **AI Document Explorer** allows you to upload PDF documents and interact with them using AI-powered analysis and summarization. Ask questions, extract key insights, and gain a deeper understanding of your documents effortlessly.
    """)
    with gr.Row():
        # Left Sidebar
        with gr.Column(scale=2):
            gr.Markdown("## Upload and Select Document")
            upload_btn = gr.File(label="Upload PDF Document", file_types=[".pdf"])
            doc_selector = gr.Dropdown(
                choices=app_config.doc_manager.get_uploaded_documents(),
                label="Documents",
                multiselect=True,
                value=[]  # Initial value as an empty list
            )
            model_selector = gr.Dropdown(choices=models, label="Models", interactive=True)
            clear_btn = gr.Button("Clear Selection")
            upload_status = gr.Textbox(label="Upload Status", interactive=False)

            # Process the uploaded file and update the UI
            upload_btn.change(
                process_uploaded_file,
                inputs=[upload_btn, doc_selector],
                outputs=[
                    upload_status,
                    gr.State(),   # page_list
                    doc_selector  # Update choices and value together
                ]
            )
            clear_btn.click(
                clear_selection,
                outputs=[doc_selector, upload_status, gr.State()]
            )
            # Reinitialize the LLM when the model selection changes
            model_selector.change(
                app_config.gen_llm.reinitialize_llm,
                inputs=[model_selector],
                outputs=[upload_status]
            )

        # Middle Section (Chat & LLM Response)
        with gr.Column(scale=6):
            gr.Markdown("## Chat with document(s)")
            chat_history = gr.Textbox(label="Chat History", interactive=False, lines=26, elem_id="chat-history", elem_classes=["chat-box"])
            with gr.Row():
                chat_input = gr.Textbox(show_label=False, placeholder="Ask additional questions about the document...", elem_id="chat-input", lines=3)
                chat_btn = gr.Button("🚀 Send", variant="primary", elem_id="send-button", scale=0)
            chat_btn.click(chat_response, inputs=[chat_input, doc_selector, chat_history], outputs=chat_history).then(
                lambda: "",  # Return an empty string to clear chat_input
                outputs=chat_input
            )

        # Right Sidebar (Sample Questions & History)
        with gr.Column(scale=2):
            gr.Markdown("## Frequently asked questions:")
            with gr.Column():
                gr.Examples(
                    examples=example_questions,
                    inputs=chat_input,
                    label=""
                )
            question_dropdown = gr.Dropdown(
                label="",
                choices=all_questions,
                interactive=True,
                info="Choose a question from the dropdown to populate the query box."
            )

            gr.Markdown("## Logs")
            history = gr.Textbox(label="Previous Queries", interactive=False)

    gr.HTML("""
    <style>
    .chat-box textarea {
        max-height: 600px !important;
        overflow-y: auto !important;
        resize: vertical;
        white-space: pre-wrap; /* Keeps formatting */
    }
    </style>
    """)

if __name__ == "__main__":
    interface.launch()

config/appConfig.py
ADDED
@@ -0,0 +1,11 @@
import logging
from retriever.llm_manager import LLMManager
from retriever.document_manager import DocumentManager

class AppConfig:
    def __init__(self):
        # Initialize LLMManager with the default model ("gemma2-9b-it")
        self.gen_llm = LLMManager()
        # Initialize DocumentManager (a single instance shared across the app)
        self.doc_manager = DocumentManager()
        logging.info("AppConfig initialized with LLMManager")

config/config.py
ADDED
@@ -0,0 +1,10 @@
class ConfigConstants:
    # Constants related to datasets and models
    DATA_SET_PATH = '/persistent/'
    EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
    RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
    GENERATION_MODEL_NAME = 'mixtral-8x7b-32768'
    GENERATION_MODELS = ["llama3-8b-8192", "qwen-2.5-32b", "mixtral-8x7b-32768", "gemma2-9b-it"]
    DEFAULT_CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200

data/document_loader.py
ADDED
@@ -0,0 +1,24 @@
# document_loader.py
import os
from typing import Optional

class DocumentLoader:
    def __init__(self):
        self.uploaded_file = None

    def load_file(self, file_path: str) -> Optional[str]:
        """
        Load the uploaded PDF file and validate it.
        Returns the file path if valid, None otherwise.
        """
        if not file_path:
            return None

        if not file_path.lower().endswith('.pdf'):
            raise ValueError("Only PDF files are supported")

        if not os.path.exists(file_path):
            raise FileNotFoundError("File does not exist")

        self.uploaded_file = file_path
        return file_path

data/pdf_reader.py
ADDED
@@ -0,0 +1,31 @@
# pdf_reader.py
import PyPDF2
from typing import List

class PDFReader:
    def __init__(self):
        self.page_list = []

    def read_pdf(self, file_path: str) -> List[str]:
        """
        Read PDF content and return a list of pages.
        Each element in the list is the text content of a page.
        """
        try:
            # Open and read the PDF file
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                num_pages = len(pdf_reader.pages)

                # Extract text from each page
                self.page_list = []
                for page_num in range(num_pages):
                    page = pdf_reader.pages[page_num]
                    text = page.extract_text()
                    if text:  # Only add non-empty pages
                        self.page_list.append(text.strip())

            return self.page_list

        except Exception as e:
            raise Exception(f"Error reading PDF: {str(e)}")

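A minimal usage sketch for PDFReader follows; the sample.pdf path is illustrative and assumed to exist locally.

from data.pdf_reader import PDFReader

reader = PDFReader()
pages = reader.read_pdf("sample.pdf")  # hypothetical path
print(f"Extracted {len(pages)} non-empty pages")
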
globals.py
ADDED
@@ -0,0 +1,6 @@
import logging
from config.appConfig import AppConfig

# Initialize AppConfig (this also initializes the LLMManager with the default model)
app_config = AppConfig()
logging.info("Global app_config initialized")

requirements.txt
ADDED
@@ -0,0 +1,11 @@
transformers
torch
faiss-cpu
sentence-transformers
langchain
llama-index
langchain-community
langchain_groq
langchain-huggingface
gradio
PyPDF2

retriever/chat_manager.py
ADDED
@@ -0,0 +1,43 @@
import logging
from typing import List
from globals import app_config

def chat_response(query: str, selected_docs: List[str], history: str) -> str:
    """
    Generate a chat response based on the user's query and the selected documents.

    Args:
        query (str): The user's query.
        selected_docs (List[str]): List of selected document filenames from the dropdown.
        history (str): The chat history.

    Returns:
        str: Updated chat history with the new response.
    """
    if not query:
        return history + "\nResponse: Please enter a query." if history else "Response: Please enter a query."

    if not selected_docs:
        return history + "\nResponse: Please select at least one document." if history else "Response: Please select at least one document."

    # Retrieve the top 5 chunks based on the query and the selected documents
    top_k_results = app_config.doc_manager.retrieve_top_k(query, selected_docs, k=5)

    if not top_k_results:
        return history + f"\nUser: {query}\nResponse: No relevant information found in the selected documents." if history else f"User: {query}\nResponse: No relevant information found in the selected documents."

    # Send the top K results to the LLM to generate a response
    try:
        llm_response, source_docs = app_config.gen_llm.generate_response(query, top_k_results)
    except Exception as e:
        return history + f"\nUser: {query}\nResponse: Error generating response: {str(e)}" if history else f"User: {query}\nResponse: Error generating response: {str(e)}"

    # Format the response for the chat history
    response = f"{llm_response}\n"
    # (Optional) Append source snippets to the response:
    # for i, doc in enumerate(source_docs, 1):
    #     doc_id = doc.metadata.get('doc_id', 'Unknown')
    #     filename = next((name for name, d_id in app_config.doc_manager.document_ids.items() if d_id == doc_id), 'Unknown')
    #     response += f"{i}. {filename}: {doc.page_content[:100]}...\n"

    return history + f"\nUser: {query}\nResponse: {response}" if history else f"User: {query}\nResponse: {response}"

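A minimal sketch of driving chat_response outside the Gradio UI, assuming GROQ_API_KEY is set and a local guide.pdf (hypothetical filename) is available to index first:

from globals import app_config
from retriever.chat_manager import chat_response

# Hypothetical: process a local PDF first so it can be selected by filename
status, pages, filename, doc_id = app_config.doc_manager.process_document("guide.pdf")
history = chat_response("What is communication server?", [filename], history="")
print(history)
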
retriever/chunk_documents.py
ADDED
@@ -0,0 +1,49 @@
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
import hashlib

def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """
    Chunk a list of page contents into smaller segments with document ID metadata.

    Args:
        page_list (list): List of strings, each string being the content of a page.
        doc_id (str): Unique identifier for the document.
        chunk_size (int): Maximum size of each chunk (default: 1000 characters).
        chunk_overlap (int): Overlap between chunks (default: 200 characters).

    Returns:
        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
        if not page_content or not isinstance(page_content, str):
            continue  # Skip empty or invalid pages

        # Split the page content into chunks
        chunks = text_splitter.split_text(page_content)

        for i, chunk in enumerate(chunks):
            # Generate a unique hash for the chunk
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            # Skip if the chunk is a duplicate
            if chunk_hash in seen_hashes:
                continue

            # Create a source identifier (e.g., "doc_123_page_1_chunk_0")
            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"

            # Add the chunk with doc_id as metadata
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)

    logging.info(f"Chunking done: produced {len(documents)} chunks")
    return documents

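A minimal sketch of exercising chunk_documents on its own; the page texts and doc ID below are made up:

from retriever.chunk_documents import chunk_documents

pages = [
    "Chapter 1. The communication server routes messages between endpoints. " * 20,
    "Chapter 2. Configuration files define ports and protected directories. " * 20,
]
chunks = chunk_documents(pages, doc_id="demo-doc", chunk_size=500, chunk_overlap=100)
print(len(chunks), chunks[0]['source'])  # e.g. 'doc_demo-doc_page_1_chunk_0'
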
retriever/document_manager.py
ADDED
@@ -0,0 +1,113 @@
import logging
import os
from typing import Any, Dict, List
import uuid
from data.document_loader import DocumentLoader
from data.pdf_reader import PDFReader
from retriever.chunk_documents import chunk_documents
from retriever.vector_store_manager import VectorStoreManager

class DocumentManager:
    def __init__(self):
        self.doc_loader = DocumentLoader()
        self.pdf_reader = PDFReader()
        self.vector_manager = VectorStoreManager()
        self.uploaded_documents = {}
        self.chunked_documents = {}
        self.document_ids = {}
        logging.info("DocumentManager initialized")

    def process_document(self, file):
        """
        Process an uploaded file: load, read the PDF, chunk, and store in the vector store.
        Returns: (status_message, page_list, filename, doc_id)
        """
        try:
            if file is None:
                return "No file uploaded", [], None, None

            logging.info(f"Processing file: {file}")

            # Load and validate the file
            file_path = self.doc_loader.load_file(file)
            filename = os.path.basename(file_path)

            # Read the PDF content
            page_list = self.pdf_reader.read_pdf(file_path)

            # Store the uploaded document
            self.uploaded_documents[filename] = file_path

            # Generate a unique document ID
            doc_id = str(uuid.uuid4())
            self.document_ids[filename] = doc_id

            # Chunk the pages
            chunks = chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200)
            self.chunked_documents[filename] = chunks

            # Add the chunks to the vector store
            self.vector_manager.add_documents(chunks)

            return (
                f"Successfully loaded {filename} with {len(page_list)} pages",
                page_list,
                filename,
                doc_id
            )

        except Exception as e:
            logging.error(f"Error processing document: {str(e)}")
            return f"Error: {str(e)}", [], None, None

    def get_uploaded_documents(self):
        """Return the list of uploaded document filenames."""
        return list(self.uploaded_documents.keys())

    def get_chunks(self, filename):
        """Return the chunks for a given filename."""
        return self.chunked_documents.get(filename, [])

    def get_document_id(self, filename):
        """Return the document ID for a given filename."""
        return self.document_ids.get(filename, None)

    def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
        """
        Retrieve the top K chunks across the selected documents based on the user's query.

        Args:
            query (str): The user's query.
            selected_docs (List[str]): List of selected document filenames from the dropdown.
            k (int): Number of top results to return (default is 5).

        Returns:
            List[Dict[str, Any]]: List of top K chunks with their text, metadata, and scores.
        """
        if not selected_docs:
            logging.warning("No documents selected for retrieval")
            return []

        all_results = []
        for filename in selected_docs:
            doc_id = self.get_document_id(filename)
            if not doc_id:
                logging.warning(f"No document ID found for filename: {filename}")
                continue

            # Search for relevant chunks within this document
            results = self.vector_manager.search(query, doc_id, k=k)
            all_results.extend(results)

        # FAISS similarity_search_with_score returns L2 distances, where lower
        # scores are more similar, so sort ascending and take the top K
        all_results.sort(key=lambda x: x['score'])
        top_k_results = all_results[:k]

        # Log the retrieved documents
        logging.info("Retrieved top K documents:")
        for i, result in enumerate(top_k_results, 1):
            doc_id = result['metadata'].get('doc_id', 'Unknown')
            filename = next((name for name, d_id in self.document_ids.items() if d_id == doc_id), 'Unknown')
            logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:100]}...")

        return top_k_results

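A minimal end-to-end sketch of DocumentManager; the sample.pdf path is illustrative, and the first run downloads the embedding model:

from retriever.document_manager import DocumentManager

manager = DocumentManager()
status, pages, filename, doc_id = manager.process_document("sample.pdf")  # hypothetical path
print(status)
if filename:
    hits = manager.retrieve_top_k("What is a communication server?", [filename], k=3)
    for hit in hits:
        print(hit['score'], hit['text'][:80])
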
retriever/llm_manager.py
ADDED
@@ -0,0 +1,116 @@
import logging
import os
from typing import List, Dict, Any, Tuple
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

class LLMManager:
    DEFAULT_MODEL = "gemma2-9b-it"  # Default model name

    def __init__(self):
        self.generation_llm = None
        logging.info("LLMManager initialized")

        # Initialize the default model during construction
        try:
            self.initialize_generation_llm(self.DEFAULT_MODEL)
            logging.info(f"Initialized default LLM model: {self.DEFAULT_MODEL}")
        except ValueError as e:
            logging.error(f"Failed to initialize default LLM model: {str(e)}")

    def initialize_generation_llm(self, model_name: str) -> None:
        """
        Initialize the generation LLM using the Groq API.

        Args:
            model_name (str): The name of the model to use for generation.

        Raises:
            ValueError: If GROQ_API_KEY is not set.
        """
        api_key = os.getenv("GROQ_API_KEY")  # Never hardcode API keys in source
        if not api_key:
            raise ValueError("GROQ_API_KEY is not set. Please add it to your environment variables.")

        os.environ["GROQ_API_KEY"] = api_key
        self.generation_llm = ChatGroq(model=model_name, temperature=0.7)
        self.generation_llm.name = model_name
        logging.info(f"Generation LLM {model_name} initialized")

    def reinitialize_llm(self, model_name: str) -> str:
        """
        Reinitialize the LLM with a new model name.

        Args:
            model_name (str): The name of the new model to initialize.

        Returns:
            str: Status message indicating success or failure.
        """
        try:
            self.initialize_generation_llm(model_name)
            return f"LLM model changed to {model_name}"
        except ValueError as e:
            logging.error(f"Failed to reinitialize LLM with model {model_name}: {str(e)}")
            return f"Error: Failed to change LLM model: {str(e)}"

    def generate_response(self, question: str, relevant_docs: List[Dict[str, Any]]) -> Tuple[str, List[Document]]:
        """
        Generate a response using the generation LLM based on the question and relevant documents.

        Args:
            question (str): The user's query.
            relevant_docs (List[Dict[str, Any]]): Relevant document chunks with text, metadata, and scores.

        Returns:
            Tuple[str, List[Document]]: The LLM's response and the source documents used.

        Raises:
            ValueError: If the generation LLM is not initialized.
            Exception: If there's an error during the QA chain invocation.
        """
        if not self.generation_llm:
            raise ValueError("Generation LLM is not initialized. Call initialize_generation_llm first.")

        # Convert the relevant documents into LangChain Document objects
        documents = [
            Document(page_content=doc['text'], metadata=doc['metadata'])
            for doc in relevant_docs
        ]

        # Create a proper retriever by subclassing BaseRetriever
        class SimpleRetriever(BaseRetriever):
            def __init__(self, docs: List[Document], **kwargs):
                super().__init__(**kwargs)  # Pass kwargs to BaseRetriever
                self._docs = docs  # Use a private attribute to store the docs
                logging.debug(f"SimpleRetriever initialized with {len(docs)} documents")

            def _get_relevant_documents(self, query: str) -> List[Document]:
                logging.debug(f"SimpleRetriever._get_relevant_documents called with query: {query}")
                return self._docs

            async def _aget_relevant_documents(self, query: str) -> List[Document]:
                logging.debug(f"SimpleRetriever._aget_relevant_documents called with query: {query}")
                return self._docs

        # Instantiate the retriever
        retriever = SimpleRetriever(docs=documents)

        # Create a retrieval-based question-answering chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=self.generation_llm,
            retriever=retriever,
            return_source_documents=True
        )

        try:
            result = qa_chain.invoke({"query": question})
            response = result['result']
            source_docs = result['source_documents']
            logging.info(f"Generated response for question: {question} : {response}")
            return response, source_docs
        except Exception as e:
            logging.error(f"Error during QA chain invocation: {str(e)}")
            raise e

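A minimal sketch of calling LLMManager directly, assuming GROQ_API_KEY is exported with a real key and the chunk dictionaries match the shape produced by chunk_documents:

import os
os.environ.setdefault("GROQ_API_KEY", "<your-key>")  # placeholder; use a real key from the Groq console

from retriever.llm_manager import LLMManager

manager = LLMManager()  # initializes the default "gemma2-9b-it" model
chunks = [{
    'text': "The communication server routes messages between client and server endpoints.",
    'metadata': {'source': 'doc_demo_page_1_chunk_0', 'doc_id': 'demo'},
    'score': 0.42,
}]
answer, sources = manager.generate_response("What does the communication server do?", chunks)
print(answer)
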
retriever/vector_store_manager.py
ADDED
@@ -0,0 +1,90 @@
import os
import logging
from config.config import ConfigConstants
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

class VectorStoreManager:
    def __init__(self, embedding_path="embeddings.faiss"):
        """
        Initialize the vector store manager.

        Args:
            embedding_path (str): Path to save/load the FAISS index.
        """
        self.embedding_path = embedding_path
        self.embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
        self.vector_store = self._initialize_vector_store()

    def _initialize_vector_store(self):
        """Initialize or load the FAISS vector store."""
        if os.path.exists(self.embedding_path):
            logging.info("Loading embeddings from local file")
            return FAISS.load_local(
                self.embedding_path,
                self.embedding_model,
                allow_dangerous_deserialization=True
            )
        else:
            logging.info("Creating new vector store")
            # Return an empty vector store; it will be populated when documents are added
            return FAISS.from_texts(
                texts=[""],  # Dummy text to initialize the index
                embedding=self.embedding_model,
                metadatas=[{"source": "init", "doc_id": "init"}]
            )

    def add_documents(self, documents):
        """
        Add new documents to the vector store and save it.

        Args:
            documents (list): List of dictionaries with 'text', 'source', and 'doc_id'.
        """
        if not documents:
            return

        texts = [doc['text'] for doc in documents]
        metadatas = [{'source': doc['source'], 'doc_id': doc['doc_id']} for doc in documents]

        logging.info("Adding new documents to vector store")
        self.vector_store.add_texts(
            texts=texts,
            metadatas=metadatas
        )
        self.vector_store.save_local(self.embedding_path)
        logging.info(f"Vector store updated and saved to {self.embedding_path}")

    def search(self, query, doc_id, k=4):
        """
        Search the vector store for relevant chunks, filtered by doc_id.

        Args:
            query (str): The user's query.
            doc_id (str): The document ID to filter by.
            k (int): Number of results to return.

        Returns:
            list: List of relevant document chunks with metadata and scores.
        """
        if not self.vector_store:
            return []

        try:
            # Define a filter function to match doc_id
            filter_fn = lambda metadata: metadata['doc_id'] == doc_id

            # Perform a similarity search with the filter
            results = self.vector_store.similarity_search_with_score(
                query=query,
                k=k,
                filter=filter_fn
            )

            # Format the results
            return [{'text': doc.page_content, 'metadata': doc.metadata, 'score': score} for doc, score in results]

        except Exception as e:
            logging.error(f"Error during vector store search: {str(e)}")
            return []

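A minimal standalone sketch of the vector store round trip; the texts, IDs, and demo.faiss path are illustrative, and the first run downloads the embedding model:

from retriever.vector_store_manager import VectorStoreManager

store = VectorStoreManager(embedding_path="demo.faiss")  # hypothetical path
store.add_documents([
    {'text': "Protected file directories restrict access to configuration files.",
     'source': "doc_demo_page_1_chunk_0", 'doc_id': "demo"},
])
for hit in store.search("How are files protected?", doc_id="demo", k=2):
    print(hit['score'], hit['text'])
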
utils/document_utils.py
ADDED
@@ -0,0 +1,56 @@
import logging
from typing import List

logs = []

class Document:
    def __init__(self, metadata, page_content):
        self.metadata = metadata
        self.page_content = page_content

def apply_sentence_keys_documents(relevant_docs: List[Document]):
    """Split each document into sentences keyed by document index and sentence letter (e.g. '0a', '0b')."""
    result = []
    for relevant_doc_index, relevant_doc in enumerate(relevant_docs):
        sentences = []
        for sentence_index, sentence in enumerate(relevant_doc.page_content.split(".")):
            # Keys run '0a', '0b', ...; this assumes at most 26 sentences per document
            sentences.append([str(relevant_doc_index) + chr(97 + sentence_index), sentence])
        result.append(sentences)

    return result

def apply_sentence_keys_response(input_string):
    """Split a response into sentences keyed by letter (e.g. 'a', 'b')."""
    sentences = input_string.split('. ')
    result = [[chr(97 + i), sentence] for i, sentence in enumerate(sentences)]
    return result

def initialize_logging():
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Custom log handler to capture logs and append them to the logs list
    class LogHandler(logging.Handler):
        def emit(self, record):
            log_entry = self.format(record)
            logs.append(log_entry)

    # Add the custom log handler to the logger
    log_handler = LogHandler()
    log_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
    logger.addHandler(log_handler)

def get_logs():
    """Retrieve logs for display."""
    return "\n".join(logs[-100:])  # Only show the last 100 log entries

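A quick sketch of the sentence-keying helpers on made-up inputs:

from utils.document_utils import Document, apply_sentence_keys_documents, apply_sentence_keys_response

docs = [Document(metadata={'doc_id': 'demo'}, page_content="First sentence. Second sentence.")]
print(apply_sentence_keys_documents(docs))
# [[['0a', 'First sentence'], ['0b', ' Second sentence'], ['0c', '']]]
print(apply_sentence_keys_response("Answer part one. Answer part two."))
# [['a', 'Answer part one'], ['b', 'Answer part two.']]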