gourisankar85 committed
Commit e6cc6f7 · verified · 1 Parent(s): a521154

Upload 13 files

app.py ADDED
@@ -0,0 +1,142 @@
+ import logging
+ import gradio as gr
+ from utils.document_utils import initialize_logging
+ from retriever.chat_manager import chat_response
+ # Note: DocumentManager is initialized in config/appConfig.py
+ from globals import app_config
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ initialize_logging()
+
+ def load_sample_question(question):
+     return question
+
+ def clear_selection():
+     return [], "", []  # Reset doc_selector to an empty list
+
+ def process_uploaded_file(file, current_selection):
+     """Process an uploaded file using DocumentManager and update the UI."""
+     status, page_list, filename, _ = app_config.doc_manager.process_document(file.name if file else None)
+
+     # Update the current selection to include the new file if not already present
+     updated_selection = current_selection if current_selection else []
+     if filename and filename not in updated_selection:
+         updated_selection.append(filename)
+
+     return (
+         status,
+         page_list,
+         gr.update(choices=app_config.doc_manager.get_uploaded_documents(), value=updated_selection)
+     )
+
+ def update_doc_selector(selected_docs):
+     """Keep selected documents in sync."""
+     return selected_docs
+
+ # UI Configuration
+ models = ["gemma2-9b-it", "llama-guard-3-8b", "qwen-2.5-32b"]
+ example_questions = [
+     "What is communication server?",
+     "Show me an example of a configuration file.",
+     "How to create Protected File Directories?",
+     "What are the attributes of the Azureblobstorage port?",
+     "What is Mediator help?",
+ ]
+ all_questions = [
+     "Can you explain Communication Server architecture?",
+     "Why does the other instance of my multi-instance qmgr seem to hang after a failover? Queue manager will not start after failover.",
+     "Explain the concept of blockchain.",
+     "What is the capital of France?",
+     "Do Surface Porosity and Pore Size Influence Mechanical Properties and Cellular Response to PEEK?",
+     "How does a vaccine work?",
+     "Tell me the step-by-step instructions for front-door installation.",
+     "What are the risk factors for heart disease?",
+ ]
+
+ with gr.Blocks() as interface:
+     interface.title = "🤖 IntelliDoc: AI Document Explorer"
+     gr.Markdown("""
+     # 🤖 IntelliDoc: AI Document Explorer
+     **AI Document Explorer** allows you to upload PDF documents and interact with them using AI-powered analysis and summarization. Ask questions, extract key insights, and gain a deeper understanding of your documents effortlessly.
+     """)
+     with gr.Row():
+         # Left Sidebar
+         with gr.Column(scale=2):
+             gr.Markdown("## Upload and Select Document")
+             upload_btn = gr.File(label="Upload PDF Document", file_types=[".pdf"])
+             doc_selector = gr.Dropdown(
+                 choices=app_config.doc_manager.get_uploaded_documents(),
+                 label="Documents",
+                 multiselect=True,
+                 value=[]  # Initial value as an empty list
+             )
+             model_selector = gr.Dropdown(choices=models, label="Models", interactive=True)
+             clear_btn = gr.Button("Clear Selection")
+             upload_status = gr.Textbox(label="Upload Status", interactive=False)
+
+             # Process the uploaded file and update the UI
+             upload_btn.change(
+                 process_uploaded_file,
+                 inputs=[upload_btn, doc_selector],
+                 outputs=[
+                     upload_status,
+                     gr.State(),  # page_list
+                     doc_selector  # Update choices and value together
+                 ]
+             )
+             clear_btn.click(
+                 clear_selection,
+                 outputs=[doc_selector, upload_status, gr.State()]
+             )
+             # Reinitialize the LLM when the model changes
+             model_selector.change(
+                 app_config.gen_llm.reinitialize_llm,
+                 inputs=[model_selector],
+                 outputs=[upload_status]
+             )
+
+         # Middle Section (Chat & LLM Response)
+         with gr.Column(scale=6):
+             gr.Markdown("## Chat with document(s)")
+             chat_history = gr.Textbox(label="Chat History", interactive=False, lines=26, elem_id="chat-history", elem_classes=["chat-box"])
+             with gr.Row():
+                 chat_input = gr.Textbox(show_label=False, placeholder="Ask additional questions about the document...", elem_id="chat-input", lines=3)
+                 chat_btn = gr.Button("🚀 Send", variant="primary", elem_id="send-button", scale=0)
+             chat_btn.click(chat_response, inputs=[chat_input, doc_selector, chat_history], outputs=chat_history).then(
+                 lambda: "",  # Return an empty string to clear the chat_input
+                 outputs=chat_input
+             )
+
+         # Right Sidebar (Sample Questions & History)
+         with gr.Column(scale=2):
+             gr.Markdown("## Frequently asked questions:")
+             with gr.Column():
+                 gr.Examples(
+                     examples=example_questions,
+                     inputs=chat_input,
+                     label=""
+                 )
+             question_dropdown = gr.Dropdown(
+                 label="",
+                 choices=all_questions,
+                 interactive=True,
+                 info="Choose a question from the dropdown to populate the query box."
+             )
+
+             gr.Markdown("## Logs")
+             history = gr.Textbox(label="Previous Queries", interactive=False)
+
+     gr.HTML("""
+     <style>
+     .chat-box textarea {
+         max-height: 600px !important;
+         overflow-y: auto !important;
+         resize: vertical;
+         white-space: pre-wrap; /* Keeps formatting */
+     }
+     </style>
+     """)
+
+ if __name__ == "__main__":
+     interface.launch()
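Note: `load_sample_question` and `question_dropdown` are defined above but never wired together in this commit, so choosing a dropdown question does not yet populate the query box despite the dropdown's `info` text. A minimal sketch of the missing hookup, assuming it is placed inside the `with gr.Blocks() as interface:` block:

    # Hypothetical wiring (not part of this commit): copy the selected
    # dropdown question into the chat input box.
    question_dropdown.change(
        load_sample_question,
        inputs=[question_dropdown],
        outputs=[chat_input]
    )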
config/appConfig.py ADDED
@@ -0,0 +1,11 @@
+ import logging
+ from retriever.llm_manager import LLMManager
+ from retriever.document_manager import DocumentManager
+
+ class AppConfig:
+     def __init__(self):
+         # Initialize LLMManager with the default model
+         self.gen_llm = LLMManager()  # This will initialize the default model ("gemma2-9b-it")
+         # Initialize DocumentManager (a single instance shared across the app)
+         self.doc_manager = DocumentManager()
+         logging.info("AppConfig initialized with LLMManager")
config/config.py ADDED
@@ -0,0 +1,10 @@
+
+ class ConfigConstants:
+     # Constants related to datasets and models
+     DATA_SET_PATH = '/persistent/'
+     EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L3-v2"
+     RE_RANKER_MODEL_NAME = 'cross-encoder/ms-marco-electra-base'
+     GENERATION_MODEL_NAME = 'mixtral-8x7b-32768'
+     GENERATION_MODELS = ["llama3-8b-8192", "qwen-2.5-32b", "mixtral-8x7b-32768", "gemma2-9b-it"]
+     DEFAULT_CHUNK_SIZE = 1000
+     CHUNK_OVERLAP = 200
data/document_loader.py ADDED
@@ -0,0 +1,24 @@
+ # document_loader.py
+ import os
+ from typing import Optional
+
+ class DocumentLoader:
+     def __init__(self):
+         self.uploaded_file = None
+
+     def load_file(self, file_path: str) -> Optional[str]:
+         """
+         Load the uploaded PDF file and validate it.
+         Returns the file path if valid, None otherwise.
+         """
+         if not file_path:
+             return None
+
+         if not file_path.lower().endswith('.pdf'):
+             raise ValueError("Only PDF files are supported")
+
+         if not os.path.exists(file_path):
+             raise FileNotFoundError("File does not exist")
+
+         self.uploaded_file = file_path
+         return file_path
data/pdf_reader.py ADDED
@@ -0,0 +1,31 @@
+ # pdf_reader.py
+ import PyPDF2
+ from typing import List
+
+ class PDFReader:
+     def __init__(self):
+         self.page_list = []
+
+     def read_pdf(self, file_path: str) -> List[str]:
+         """
+         Read PDF content and return a list of pages.
+         Each element in the list is the text content of a page.
+         """
+         try:
+             # Open and read the PDF file
+             with open(file_path, 'rb') as file:
+                 pdf_reader = PyPDF2.PdfReader(file)
+                 num_pages = len(pdf_reader.pages)
+
+                 # Extract text from each page
+                 self.page_list = []
+                 for page_num in range(num_pages):
+                     page = pdf_reader.pages[page_num]
+                     text = page.extract_text()
+                     if text:  # Only add non-empty pages
+                         self.page_list.append(text.strip())
+
+             return self.page_list
+
+         except Exception as e:
+             raise Exception(f"Error reading PDF: {str(e)}")
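As a quick sanity check, `PDFReader` can be exercised on its own; a minimal sketch, assuming a local `sample.pdf` exists (the filename is hypothetical):

    from data.pdf_reader import PDFReader

    reader = PDFReader()
    pages = reader.read_pdf("sample.pdf")  # hypothetical local file
    print(f"Extracted {len(pages)} non-empty pages")

Note that pages with no extractable text are silently dropped, so `len(pages)` can be smaller than the page count of the PDF.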
globals.py ADDED
@@ -0,0 +1,6 @@
+ import logging
+ from config.appConfig import AppConfig
+
+ # Initialize AppConfig (this will initialize the LLMManager with the default model)
+ app_config = AppConfig()
+ logging.info("Global app_config initialized")
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ transformers
+ torch
+ faiss-cpu
+ sentence-transformers
+ langchain
+ llama-index
+ langchain-community
+ langchain_groq
+ langchain-huggingface
+ gradio
+ PyPDF2
retriever/chat_manager.py ADDED
@@ -0,0 +1,43 @@
+ import logging
+ from typing import List
+ from globals import app_config
+
+ def chat_response(query: str, selected_docs: List[str], history: str) -> str:
+     """
+     Generate a chat response based on the user's query and selected documents.
+
+     Args:
+         query (str): The user's query.
+         selected_docs (List[str]): List of selected document filenames from the dropdown.
+         history (str): The chat history.
+
+     Returns:
+         str: Updated chat history with the new response.
+     """
+     if not query:
+         return (history + "\n" + "Response: Please enter a query.") if history else "Response: Please enter a query."
+
+     if not selected_docs:
+         return (history + "\n" + "Response: Please select at least one document.") if history else "Response: Please select at least one document."
+
+     # Retrieve the top 5 chunks based on the query and selected documents
+     top_k_results = app_config.doc_manager.retrieve_top_k(query, selected_docs, k=5)
+
+     if not top_k_results:
+         no_hit = f"User: {query}\nResponse: No relevant information found in the selected documents."
+         return (history + "\n" + no_hit) if history else no_hit
+
+     # Send the top K results to the LLM to generate a response
+     try:
+         llm_response, source_docs = app_config.gen_llm.generate_response(query, top_k_results)
+     except Exception as e:
+         err = f"User: {query}\nResponse: Error generating response: {str(e)}"
+         return (history + "\n" + err) if history else err
+
+     # Format the response for the chat history
+     response = f"{llm_response}\n"
+     '''for i, doc in enumerate(source_docs, 1):
+         doc_id = doc.metadata.get('doc_id', 'Unknown')
+         filename = next((name for name, d_id in app_config.doc_manager.document_ids.items() if d_id == doc_id), 'Unknown')
+         response += f"{i}. {filename}: {doc.page_content[:100]}...\n"'''
+
+     return (history + "\n" + f"User: {query}\nResponse: {response}") if history else f"User: {query}\nResponse: {response}"
retriever/chunk_documents.py ADDED
@@ -0,0 +1,49 @@
+ import logging
+ import hashlib
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
+     """
+     Chunk a list of page contents into smaller segments with document ID metadata.
+
+     Args:
+         page_list (list): List of strings, each string being the content of a page.
+         doc_id (str): Unique identifier for the document.
+         chunk_size (int): Maximum size of each chunk (default: 1000 characters).
+         chunk_overlap (int): Overlap between chunks (default: 200 characters).
+
+     Returns:
+         list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
+     """
+     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+     documents = []
+     seen_hashes = set()  # Track hashes of chunks to avoid duplicates
+
+     for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
+         if not page_content or not isinstance(page_content, str):
+             continue  # Skip empty or invalid pages
+
+         # Split the page content into chunks
+         chunks = text_splitter.split_text(page_content)
+
+         for i, chunk in enumerate(chunks):
+             # Generate a unique hash for the chunk
+             chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
+
+             # Skip if the chunk is a duplicate
+             if chunk_hash in seen_hashes:
+                 continue
+
+             # Create a source identifier (e.g., "doc_123_page_1_chunk_0")
+             source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"
+
+             # Add the chunk with doc_id as metadata
+             documents.append({
+                 'text': chunk,
+                 'source': source,
+                 'doc_id': doc_id
+             })
+             seen_hashes.add(chunk_hash)
+
+     logging.info(f"Chunking complete: produced {len(documents)} chunks")
+     return documents
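`chunk_documents` is a pure function, so it is easy to try in isolation; a minimal sketch with made-up page contents and a hypothetical `doc_id`:

    from retriever.chunk_documents import chunk_documents

    pages = [
        "IntelliDoc lets you upload PDF documents and query them. " * 3,
        "Chunks carry doc_id metadata so retrieval can filter per document.",
    ]
    chunks = chunk_documents(pages, doc_id="demo-doc", chunk_size=80, chunk_overlap=20)
    for c in chunks[:3]:
        print(c['source'], '->', c['text'][:40])

Because the overlap duplicates text across neighbouring chunks, the SHA-256 dedup only suppresses chunks that are identical end to end (for example repeated headers), not merely overlapping ones.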
retriever/document_manager.py ADDED
@@ -0,0 +1,113 @@
+ import logging
+ import os
+ import uuid
+ from typing import Any, Dict, List
+ from data.document_loader import DocumentLoader
+ from data.pdf_reader import PDFReader
+ from retriever.chunk_documents import chunk_documents
+ from retriever.vector_store_manager import VectorStoreManager
+
+ class DocumentManager:
+     def __init__(self):
+         self.doc_loader = DocumentLoader()
+         self.pdf_reader = PDFReader()
+         self.vector_manager = VectorStoreManager()
+         self.uploaded_documents = {}
+         self.chunked_documents = {}
+         self.document_ids = {}
+         logging.info("DocumentManager initialized")
+
+     def process_document(self, file):
+         """
+         Process an uploaded file: load, read PDF, chunk, and store in the vector store.
+         Returns: (status_message, page_list, filename, doc_id)
+         """
+         try:
+             if file is None:
+                 return "No file uploaded", [], None, None
+
+             logging.info(f"Processing file: {file}")
+
+             # Load and validate the file
+             file_path = self.doc_loader.load_file(file)
+             filename = os.path.basename(file_path)
+
+             # Read the PDF content
+             page_list = self.pdf_reader.read_pdf(file_path)
+
+             # Store the uploaded document
+             self.uploaded_documents[filename] = file_path
+
+             # Generate a unique document ID
+             doc_id = str(uuid.uuid4())
+             self.document_ids[filename] = doc_id
+
+             # Chunk the pages
+             chunks = chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200)
+             self.chunked_documents[filename] = chunks
+
+             # Add the chunks to the vector store
+             self.vector_manager.add_documents(chunks)
+
+             return (
+                 f"Successfully loaded {filename} with {len(page_list)} pages",
+                 page_list,
+                 filename,
+                 doc_id
+             )
+
+         except Exception as e:
+             logging.error(f"Error processing document: {str(e)}")
+             return f"Error: {str(e)}", [], None, None
+
+     def get_uploaded_documents(self):
+         """Return the list of uploaded document filenames."""
+         return list(self.uploaded_documents.keys())
+
+     def get_chunks(self, filename):
+         """Return chunks for a given filename."""
+         return self.chunked_documents.get(filename, [])
+
+     def get_document_id(self, filename):
+         """Return the document ID for a given filename."""
+         return self.document_ids.get(filename, None)
+
+     def retrieve_top_k(self, query: str, selected_docs: List[str], k: int = 5) -> List[Dict[str, Any]]:
+         """
+         Retrieve the top K chunks across the selected documents based on the user's query.
+
+         Args:
+             query (str): The user's query.
+             selected_docs (List[str]): List of selected document filenames from the dropdown.
+             k (int): Number of top results to return (default is 5).
+
+         Returns:
+             List[Dict[str, Any]]: List of top K chunks with their text, metadata, and scores.
+         """
+         if not selected_docs:
+             logging.warning("No documents selected for retrieval")
+             return []
+
+         all_results = []
+         for filename in selected_docs:
+             doc_id = self.get_document_id(filename)
+             if not doc_id:
+                 logging.warning(f"No document ID found for filename: {filename}")
+                 continue
+
+             # Search for relevant chunks within this document
+             results = self.vector_manager.search(query, doc_id, k=k)
+             all_results.extend(results)
+
+         # Sort all results by score in descending order and take the top K
+         all_results.sort(key=lambda x: x['score'], reverse=True)
+         top_k_results = all_results[:k]
+
+         # Log the retrieved documents
+         logging.info("Retrieved top K documents:")
+         for i, result in enumerate(top_k_results, 1):
+             doc_id = result['metadata'].get('doc_id', 'Unknown')
+             filename = next((name for name, d_id in self.document_ids.items() if d_id == doc_id), 'Unknown')
+             logging.info(f"{i}. Filename: {filename}, Doc ID: {doc_id}, Score: {result['score']:.4f}, Text: {result['text'][:100]}...")
+
+         return top_k_results
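End to end, `DocumentManager` is the piece the UI talks to; a hedged sketch of the full flow, assuming a local `sample.pdf` (hypothetical) and that the embedding model can be downloaded on first use:

    from retriever.document_manager import DocumentManager

    dm = DocumentManager()
    status, pages, filename, doc_id = dm.process_document("sample.pdf")  # hypothetical file
    print(status)
    for hit in dm.retrieve_top_k("What is a communication server?", [filename], k=3):
        print(round(hit['score'], 4), hit['text'][:80])

One caveat: `retrieve_top_k` sorts scores in descending order, which assumes higher is better; FAISS's `similarity_search_with_score` returns distances for some index types, where lower is better, so the ordering may need to be inverted depending on the index.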
retriever/llm_manager.py ADDED
@@ -0,0 +1,116 @@
+ import logging
+ import os
+ from typing import List, Dict, Any, Tuple
+ from langchain_groq import ChatGroq
+ from langchain.chains import RetrievalQA
+ from langchain_core.documents import Document
+ from langchain_core.retrievers import BaseRetriever
+
+ class LLMManager:
+     DEFAULT_MODEL = "gemma2-9b-it"  # Default model name
+
+     def __init__(self):
+         self.generation_llm = None
+         logging.info("LLMManager initialized")
+
+         # Initialize the default model during construction
+         try:
+             self.initialize_generation_llm(self.DEFAULT_MODEL)
+             logging.info(f"Initialized default LLM model: {self.DEFAULT_MODEL}")
+         except ValueError as e:
+             logging.error(f"Failed to initialize default LLM model: {str(e)}")
+
+     def initialize_generation_llm(self, model_name: str) -> None:
+         """
+         Initialize the generation LLM using the Groq API.
+
+         Args:
+             model_name (str): The name of the model to use for generation.
+
+         Raises:
+             ValueError: If GROQ_API_KEY is not set.
+         """
+         # Read the API key from the environment; never hard-code secrets in source
+         api_key = os.getenv("GROQ_API_KEY")
+         if not api_key:
+             raise ValueError("GROQ_API_KEY is not set. Please add it to your environment variables.")
+
+         self.generation_llm = ChatGroq(model=model_name, temperature=0.7)
+         self.generation_llm.name = model_name
+         logging.info(f"Generation LLM {model_name} initialized")
+
+     def reinitialize_llm(self, model_name: str) -> str:
+         """
+         Reinitialize the LLM with a new model name.
+
+         Args:
+             model_name (str): The name of the new model to initialize.
+
+         Returns:
+             str: Status message indicating success or failure.
+         """
+         try:
+             self.initialize_generation_llm(model_name)
+             return f"LLM model changed to {model_name}"
+         except ValueError as e:
+             logging.error(f"Failed to reinitialize LLM with model {model_name}: {str(e)}")
+             return f"Error: Failed to change LLM model: {str(e)}"
+
+     def generate_response(self, question: str, relevant_docs: List[Dict[str, Any]]) -> Tuple[str, List[Document]]:
+         """
+         Generate a response using the generation LLM based on the question and relevant documents.
+
+         Args:
+             question (str): The user's query.
+             relevant_docs (List[Dict[str, Any]]): List of relevant document chunks with text, metadata, and scores.
+
+         Returns:
+             Tuple[str, List[Document]]: The LLM's response and the source documents used.
+
+         Raises:
+             ValueError: If the generation LLM is not initialized.
+             Exception: If there's an error during the QA chain invocation.
+         """
+         if not self.generation_llm:
+             raise ValueError("Generation LLM is not initialized. Call initialize_generation_llm first.")
+
+         # Convert the relevant documents into LangChain Document objects
+         documents = [
+             Document(page_content=doc['text'], metadata=doc['metadata'])
+             for doc in relevant_docs
+         ]
+
+         # Create a proper retriever by subclassing BaseRetriever
+         class SimpleRetriever(BaseRetriever):
+             def __init__(self, docs: List[Document], **kwargs):
+                 super().__init__(**kwargs)  # Pass kwargs to BaseRetriever
+                 self._docs = docs  # Use a private attribute to store docs
+                 logging.debug(f"SimpleRetriever initialized with {len(docs)} documents")
+
+             def _get_relevant_documents(self, query: str) -> List[Document]:
+                 logging.debug(f"SimpleRetriever._get_relevant_documents called with query: {query}")
+                 return self._docs
+
+             async def _aget_relevant_documents(self, query: str) -> List[Document]:
+                 logging.debug(f"SimpleRetriever._aget_relevant_documents called with query: {query}")
+                 return self._docs
+
+         # Instantiate the retriever
+         retriever = SimpleRetriever(docs=documents)
+
+         # Create a retrieval-based question-answering chain
+         qa_chain = RetrievalQA.from_chain_type(
+             llm=self.generation_llm,
+             retriever=retriever,
+             return_source_documents=True
+         )
+
+         try:
+             result = qa_chain.invoke({"query": question})
+             response = result['result']
+             source_docs = result['source_documents']
+             logging.info(f"Generated response for question: {question} : {response}")
+             return response, source_docs
+         except Exception as e:
+             logging.error(f"Error during QA chain invocation: {str(e)}")
+             raise e
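The manager reads `GROQ_API_KEY` from the environment, so the key must be set before construction, e.g. via `export GROQ_API_KEY=...` in the shell or a Hugging Face Space secret. A minimal sketch of switching models at runtime:

    import os
    os.environ.setdefault("GROQ_API_KEY", "<your-key>")  # placeholder only; never commit real keys

    from retriever.llm_manager import LLMManager

    llm = LLMManager()  # initializes the default "gemma2-9b-it"
    print(llm.reinitialize_llm("qwen-2.5-32b"))  # expected: "LLM model changed to qwen-2.5-32b"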
retriever/vector_store_manager.py ADDED
@@ -0,0 +1,90 @@
+ import os
+ import logging
+ from config.config import ConfigConstants
+ from langchain_huggingface import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import FAISS
+
+ class VectorStoreManager:
+     def __init__(self, embedding_path="embeddings.faiss"):
+         """
+         Initialize the vector store manager.
+
+         Args:
+             embedding_path (str): Path to save/load the FAISS index.
+         """
+         self.embedding_path = embedding_path
+         self.embedding_model = HuggingFaceEmbeddings(model_name=ConfigConstants.EMBEDDING_MODEL_NAME)
+         self.vector_store = self._initialize_vector_store()
+
+     def _initialize_vector_store(self):
+         """Initialize or load the FAISS vector store."""
+         if os.path.exists(self.embedding_path):
+             logging.info("Loading embeddings from local file")
+             return FAISS.load_local(
+                 self.embedding_path,
+                 self.embedding_model,
+                 allow_dangerous_deserialization=True
+             )
+         else:
+             logging.info("Creating new vector store")
+             # Return an empty vector store; it will be populated when documents are added
+             return FAISS.from_texts(
+                 texts=[""],  # Dummy text to initialize the index
+                 embedding=self.embedding_model,
+                 metadatas=[{"source": "init", "doc_id": "init"}]
+             )
+
+     def add_documents(self, documents):
+         """
+         Add new documents to the vector store and save it.
+
+         Args:
+             documents (list): List of dictionaries with 'text', 'source', and 'doc_id'.
+         """
+         if not documents:
+             return
+
+         texts = [doc['text'] for doc in documents]
+         metadatas = [{'source': doc['source'], 'doc_id': doc['doc_id']} for doc in documents]
+
+         logging.info("Adding new documents to vector store")
+         self.vector_store.add_texts(
+             texts=texts,
+             metadatas=metadatas
+         )
+         self.vector_store.save_local(self.embedding_path)
+         logging.info(f"Vector store updated and saved to {self.embedding_path}")
+
+     def search(self, query, doc_id, k=4):
+         """
+         Search the vector store for relevant chunks, filtered by doc_id.
+
+         Args:
+             query (str): The user's query.
+             doc_id (str): The document ID to filter by.
+             k (int): Number of results to return.
+
+         Returns:
+             list: List of relevant document chunks with metadata and scores.
+         """
+         if not self.vector_store:
+             return []
+
+         try:
+             # Define a filter function to match doc_id
+             filter_fn = lambda metadata: metadata['doc_id'] == doc_id
+
+             # Perform a similarity search with the filter
+             results = self.vector_store.similarity_search_with_score(
+                 query=query,
+                 k=k,
+                 filter=filter_fn
+             )
+
+             # Format results
+             return [{'text': doc.page_content, 'metadata': doc.metadata, 'score': score} for doc, score in results]
+
+         except Exception as e:
+             logging.error(f"Error during vector store search: {str(e)}")
+             return []
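A minimal sketch of using the store directly, with hypothetical chunk dictionaries shaped like the output of `chunk_documents`:

    from retriever.vector_store_manager import VectorStoreManager

    vsm = VectorStoreManager(embedding_path="demo.faiss")  # hypothetical path
    vsm.add_documents([
        {'text': 'The communication server relays messages between ports.',
         'source': 'doc_demo_page_1_chunk_0', 'doc_id': 'demo'},
    ])
    for hit in vsm.search("What does the communication server do?", doc_id="demo", k=2):
        print(hit['score'], hit['text'])

Because the index is initialized with a dummy "" text under `doc_id` "init", the `doc_id` filter also keeps that placeholder entry out of real search results.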
utils/document_utils.py ADDED
@@ -0,0 +1,56 @@
+ import logging
+ from typing import List
+
+ logs = []
+
+ class Document:
+     def __init__(self, metadata, page_content):
+         self.metadata = metadata
+         self.page_content = page_content
+
+ def apply_sentence_keys_documents(relevant_docs: List[Document]):
+     result = []
+     '''for i, doc in enumerate(relevant_docs):
+         doc_id = str(i)
+         title_passage = doc.page_content.split('\nPassage: ')
+         title = title_passage[0]
+         passages = title_passage[1].split('. ')
+
+         doc_result = []
+         doc_result.append([f"{doc_id}a", title])
+
+         for j, passage in enumerate(passages):
+             doc_result.append([f"{doc_id}{chr(98 + j)}", passage])
+
+         result.append(doc_result)'''
+
+     for relevant_doc_index, relevant_doc in enumerate(relevant_docs):
+         sentences = []
+         for sentence_index, sentence in enumerate(relevant_doc.page_content.split(".")):
+             sentences.append([str(relevant_doc_index) + chr(97 + sentence_index), sentence])
+         result.append(sentences)
+
+     return result
+
+ def apply_sentence_keys_response(input_string):
+     sentences = input_string.split('. ')
+     result = [[chr(97 + i), sentence] for i, sentence in enumerate(sentences)]
+     return result
+
+ def initialize_logging():
+     logger = logging.getLogger()
+     logger.setLevel(logging.INFO)
+
+     # Custom log handler to capture logs and add them to the logs list
+     class LogHandler(logging.Handler):
+         def emit(self, record):
+             log_entry = self.format(record)
+             logs.append(log_entry)
+
+     # Add the custom log handler to the logger
+     log_handler = LogHandler()
+     log_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
+     logger.addHandler(log_handler)
+
+ def get_logs():
+     """Retrieve logs for display."""
+     return "\n".join(logs[-100:])  # Only show the last 100 log entries