import streamlit as st
import os
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory
from langchain_groq import ChatGroq
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
import base64
if 'pdf_ref' not in st.session_state:
    st.session_state.pdf_ref = None
# Initialize the Groq API key and the model.
# NOTE: never commit a real key; load it from the environment or st.secrets instead.
os.environ["GROQ_API_KEY"] = os.environ.get("GROQ_API_KEY", "YOUR_GROQ_API_KEY")
llm = ChatGroq(
    model='llama3-70b-8192',
    temperature=0.5,
    max_tokens=None,
    timeout=None,
    max_retries=2
)
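# Sketch (assumption, not part of the app flow): the Groq chat model can be
# smoke-tested in isolation before wiring it into a chain, e.g.
#
#   reply = llm.invoke("Reply with the single word: ok")
#   print(reply.content)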
# Define OCR functions for image and PDF files
def ocr_image(image_path, language='eng+guj'):
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img, lang=language)
    return text

def ocr_pdf(pdf_path, language='eng+guj'):
    images = convert_from_path(pdf_path)
    all_text = ""
    for img in images:
        text = pytesseract.image_to_string(img, lang=language)
        all_text += text + "\n"
    return all_text

def ocr_file(file_path):
    file_extension = os.path.splitext(file_path)[1].lower()
    if file_extension == ".pdf":
        text_re = ocr_pdf(file_path, language='guj+eng')
    elif file_extension in [".jpg", ".jpeg", ".png", ".bmp"]:
        text_re = ocr_image(file_path, language='guj+eng')
    else:
        raise ValueError("Unsupported file format. Supported formats are PDF, JPG, JPEG, PNG, BMP.")
    return text_re

def get_text_chunks(text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = text_splitter.split_text(text)
    return chunks
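# Example (sketch): how the OCR and chunking helpers compose. The file name
# below is hypothetical.
#
#   text = ocr_file("temp/acpc_notice.pdf")   # Gujarati+English OCR, page by page
#   chunks = get_text_chunks(text)            # ~500-char chunks with 100-char overlap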
# Function to create or update the vector store
def get_vector_store(text_chunks):
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # Ensure the directory exists before saving the vector store
    os.makedirs("faiss_index", exist_ok=True)
    vector_store.save_local("faiss_index")
    return vector_store

# Function to process multiple files and build the vector store
def process_ocr_and_pdf_files(file_paths):
    raw_text = ""
    for file_path in file_paths:
        raw_text += ocr_file(file_path) + "\n"
    text_chunks = get_text_chunks(raw_text)
    return get_vector_store(text_chunks)
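# Example (sketch): building the index from a batch of uploads and reloading it
# later. The paths are hypothetical; the embedding model used for reloading must
# match the one used in get_vector_store.
#
#   store = process_ocr_and_pdf_files(["temp/form.pdf", "temp/marksheet.jpg"])
#   # ...in a later session:
#   # store = FAISS.load_local("faiss_index", embeddings,
#   #                          allow_dangerous_deserialization=True)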
# Conversational chain for Q&A
def get_conversational_chain():
    template = """Core Identity & Responsibilities
Role: Official AI Assistant for Admission Committee for Professional Courses (ACPC), Gujarat
Mission: Process OCR-extracted text and provide clear, direct guidance on admissions and scholarships
Focus: Deliver user-friendly responses while handling OCR complexities internally
Processing Framework
1. Text & Document Processing
Process OCR-extracted text from various document types with attention to tables and structured data
Internally identify and handle OCR errors without explicitly mentioning them unless critical
Preserve tabular structures and relationships between data points
Present information in clean, readable formats regardless of source OCR quality
2. Language Handling
Support seamless communication in both Gujarati and English
Respond in the same language as the user's query
Present technical terms in both languages when relevant
Adjust language complexity to user comprehension level
3. Response Principles
Provide direct, concise answers (2-3 sentences for simple queries)
Skip unnecessary OCR quality disclaimers unless information is critically ambiguous
Present information in user-friendly formats, especially for tables and numerical data
Maintain professional yet conversational tone
Query Handling Strategies
1. Direct Information Queries
Provide straightforward answers without mentioning OCR processing
Example:
User: "What is the last date for application submission?"
Response: "The last date for application submission is June 15, 2025."
(NOT: "Based on the OCR-processed text, the last date appears to be...")
2. Table Data Extraction
Present tabular information in clean, structured format
Preserve relationships between data points
Example:
User: "What are the fees for different courses?"
Response:
"The fees for various courses are:
B.Tech: ₹1,15,000 (General), ₹58,000 (SC/ST)
B.Pharm: ₹85,000 (General), ₹42,500 (SC/ST)"
(NOT: "According to the OCR-extracted table, which may have quality issues...")
3. Ambiguous Information Handling
If OCR quality affects critical information (like dates, amounts, eligibility):
Provide the most likely correct information
Add a brief note suggesting verification only for critical information
Example: "The application deadline is June 15, 2025. For this important deadline, we recommend confirming on the official ACPC website."
4. Uncertain Information Protocol
For critically unclear OCR content:
State the most probable information
Add a simple verification suggestion without mentioning OCR
Example: "Based on the available information, the income limit appears to be ₹6,00,000. For this critical criterion, please verify on the official ACPC portal."
5. Structured Document Navigation
Present information in the same logical structure as the original document
Use headings and bullet points for clarity when appropriate
Maintain document hierarchies when explaining multi-step processes
6. Out-of-Scope Queries
Politely redirect without mentioning document or OCR limitations
Example: "This query is outside the scope of ACPC admission guidelines. For information about [topic], please contact [appropriate authority]."
7. Key Information Emphasis
Highlight critical information like deadlines, eligibility criteria, and document requirements
Make important numerical data visually distinct
Prioritize accuracy for dates, amounts, and eligibility requirements
8. Multi-Part Query Handling
Address each component of multi-part queries separately
Maintain logical flow between related pieces of information
Preserve context when explaining complex processes
9. Completeness Guidelines
Ensure responses cover all aspects of user queries
Provide step-by-step guidance for procedural questions
Include relevant related information that users might need
10. Response Quality Control
Internally verify numerical data consistency
Apply contextual understanding to identify potential OCR errors without mentioning them
Present information with confidence unless critically uncertain
Focus on delivering actionable information rather than discussing document limitations
Input:
OCR-processed text from uploaded documents: {context}
Chat History: {history}
Current Question: {question}
Output:
Give a clear, direct, and user-friendly response that focuses on the information itself rather than its OCR source. Present information confidently, mentioning verification only for critically important or potentially ambiguous details.
"""
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-MiniLM-L6-v2", model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': True})
new_vector_store = FAISS.load_local(
"faiss_index", embeddings, allow_dangerous_deserialization=True
)
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["history", "context", "question"], template=template)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=new_vector_store.as_retriever(), chain_type='stuff', verbose=True, chain_type_kwargs={"verbose": True,"prompt": QA_CHAIN_PROMPT,"memory": ConversationBufferMemory(memory_key="history",input_key="question"),})
return qa_chain
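# Example (sketch): invoking the chain directly. RetrievalQA expects a "query"
# key and returns its answer under "result".
#
#   chain = get_conversational_chain()
#   out = chain({"query": "What is the last date for application submission?"},
#               return_only_outputs=True)
#   print(out["result"])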
def handle_uploaded_file(uploaded_file, show_in_sidebar=False):
    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    file_path = os.path.join("temp", uploaded_file.name)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    # Show the document in the sidebar if requested
    if show_in_sidebar:
        st.sidebar.write(f"### File: {uploaded_file.name}")
        if file_extension == ".pdf":
            # Embed the PDF in the sidebar via a base64 data-URI iframe
            with open(file_path, "rb") as pdf_file:
                pdf_data = pdf_file.read()
            pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
            st.sidebar.markdown(
                f'<iframe src="data:application/pdf;base64,{pdf_base64}" width="500" height="500"></iframe>',
                unsafe_allow_html=True,
            )
        elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
            img = Image.open(file_path)
            st.sidebar.image(img, caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            st.sidebar.text_area("File Content", content, height=300)
    # Optionally show the document in the main content area:
    # if file_extension == '.pdf':
    #     st.components.v1.html(f'<embed src="{file_path}" width="700" height="500" type="application/pdf">')
    # elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp']:
    #     st.image(Image.open(file_path), caption=f"Uploaded Image: {uploaded_file.name}", use_container_width=True)
    # else:
    #     with open(file_path, 'r', encoding='utf-8') as f:
    #         st.text_area("File Content", f.read(), height=300)
def user_input(user_question):
    # RetrievalQA performs its own retrieval through the FAISS retriever,
    # so only the query needs to be passed in; a separate similarity_search
    # here would be ignored by the chain.
    chain = get_conversational_chain()
    response = chain({"query": user_question}, return_only_outputs=True)
    result = response.get("result", "No result found")
    # Save the question and answer to session state for history tracking
    if 'conversation_history' not in st.session_state:
        st.session_state.conversation_history = []
    # Append the new question and response to the history
    st.session_state.conversation_history.append({'question': user_question, 'answer': result})
    return result
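# Example (sketch): user_input ties retrieval and generation together and
# records the exchange in st.session_state for the history expander.
#
#   answer = user_input("What are the fees for different courses?")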
# Streamlit app to upload files and interact with the Q&A system
def main():
    st.title("File Upload and OCR Processing")
    st.write("Upload up to 5 files (PDF, JPG, JPEG, PNG, BMP)")
    uploaded_files = st.file_uploader(
        "Choose files", type=["pdf", "jpg", "jpeg", "png", "bmp"], accept_multiple_files=True
    )
    if uploaded_files:
        file_paths = []
        # Save uploaded files and process them
        for uploaded_file in uploaded_files[:5]:  # Limit to 5 files
            file_path = os.path.join("temp", uploaded_file.name)
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(file_path)
        # Run OCR on the files and build the vector store
        st.write("Processing files...")
        process_ocr_and_pdf_files(file_paths)
        st.write("Processing completed! The vector store has been updated.")
    show_in_sidebar = st.sidebar.checkbox("Show files in Sidebar", value=True)
    if uploaded_files:
        # Display each uploaded file in its native format
        for uploaded_file in uploaded_files:
            handle_uploaded_file(uploaded_file, show_in_sidebar)
    # Ask the user for a question related to the documents
    user_question = st.text_input("Ask a question related to the uploaded documents:")
    if user_question:
        response = user_input(user_question)
        st.write("Answer:", response)
    with st.expander('Conversation History'):
        # .get keeps the expander safe to render before any question is asked
        for entry in st.session_state.get('conversation_history', []):
            st.info(f"Q: {entry['question']}\nA: {entry['answer']}")
if __name__ == "__main__":
    main()