vikramronavrsc commited on
Commit
dbb1b59
Β·
verified Β·
1 Parent(s): 1169500

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +356 -573
app.py CHANGED
@@ -1,235 +1,28 @@
1
- # app.py - Optimized for Hugging Face Spaces
2
  import os
3
  import tempfile
4
  import shutil
5
  import PyPDF2
6
  import streamlit as st
7
  import torch
 
 
8
  from langchain.text_splitter import RecursiveCharacterTextSplitter
9
  from langchain_community.vectorstores import FAISS
10
  from langchain.chains import RetrievalQA
11
  from langchain.docstore.document import Document
12
  from langchain.prompts import PromptTemplate
13
- from langchain_huggingface import HuggingFaceEmbeddings
14
- from langchain_community.llms import HuggingFaceHub
15
  import time
16
  import psutil
17
  import uuid
18
  import atexit
19
- import json
20
- import hashlib
21
- from web3 import Web3
22
-
23
- # Set page configuration
24
- st.set_page_config(
25
- page_title="RAG System",
26
- layout="wide",
27
- initial_sidebar_state="expanded"
28
- )
29
-
30
- # Custom CSS for better UI
31
- def load_css():
32
- st.markdown("""
33
- <style>
34
- /* Main layout styling */
35
- .main {
36
- background-color: #f9fafb;
37
- }
38
-
39
- /* Card styling */
40
- .card {
41
- border-radius: 10px;
42
- background-color: white;
43
- padding: 20px;
44
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.05);
45
- margin-bottom: 20px;
46
- }
47
-
48
- /* Two-column layout */
49
- .answer-section {
50
- background-color: white;
51
- border-radius: 10px;
52
- padding: 20px;
53
- margin-bottom: 15px;
54
- border-left: 4px solid #4CAF50;
55
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
56
- }
57
-
58
- .sources-section {
59
- background-color: white;
60
- border-radius: 10px;
61
- padding: 15px;
62
- margin-bottom: 15px;
63
- border-left: 4px solid #2196F3;
64
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.05);
65
- }
66
-
67
- .source-item {
68
- padding: 10px;
69
- border-radius: 5px;
70
- background-color: #f8f9fa;
71
- margin-bottom: 10px;
72
- border: 1px solid #eee;
73
- }
74
-
75
- .source-header {
76
- font-weight: bold;
77
- margin-bottom: 5px;
78
- display: flex;
79
- justify-content: space-between;
80
- }
81
-
82
- .verified-badge {
83
- background-color: #4CAF50;
84
- color: white;
85
- padding: 2px 8px;
86
- border-radius: 10px;
87
- font-size: 0.8em;
88
- }
89
-
90
- /* Method selection styling */
91
- .method-container {
92
- display: flex;
93
- gap: 10px;
94
- margin-bottom: 15px;
95
- }
96
-
97
- .method-button {
98
- flex: 1;
99
- text-align: center;
100
- padding: 10px;
101
- border-radius: 5px;
102
- cursor: pointer;
103
- transition: all 0.3s;
104
- }
105
-
106
- .direct-method {
107
- background-color: #e3f2fd;
108
- border: 1px solid #bbdefb;
109
- color: #1976D2;
110
- }
111
-
112
- .direct-method:hover {
113
- background-color: #bbdefb;
114
- }
115
-
116
- .enhanced-method {
117
- background-color: #e8f5e9;
118
- border: 1px solid #c8e6c9;
119
- color: #388E3C;
120
- }
121
-
122
- .enhanced-method:hover {
123
- background-color: #c8e6c9;
124
- }
125
-
126
- .method-active {
127
- box-shadow: 0 0 0 2px #3f51b5;
128
- }
129
-
130
- /* Voice button styling */
131
- .voice-button {
132
- width: 50px;
133
- height: 50px;
134
- border-radius: 50%;
135
- background-color: #f44336;
136
- color: white;
137
- display: flex;
138
- align-items: center;
139
- justify-content: center;
140
- cursor: pointer;
141
- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
142
- transition: all 0.3s;
143
- margin: 0 auto;
144
- }
145
-
146
- .voice-button:hover {
147
- transform: scale(1.05);
148
- box-shadow: 0 4px 8px rgba(0, 0, 0, 0.3);
149
- }
150
-
151
- /* Header styling */
152
- h1, h2, h3 {
153
- color: #333;
154
- }
155
-
156
- /* Button styling */
157
- .stButton>button {
158
- border-radius: 5px;
159
- font-weight: 500;
160
- }
161
- </style>
162
- """, unsafe_allow_html=True)
163
 
164
- # Simple blockchain utility
165
- class BlockchainVerifier:
166
- def __init__(self, contract_address=None):
167
- self.contract_address = contract_address
168
- self.is_connected = False
169
- self.user_address = None
170
-
171
- def connect_wallet(self, wallet_address):
172
- """Simulate connecting to a wallet"""
173
- self.is_connected = True
174
- self.user_address = wallet_address
175
- return True
176
-
177
- def compute_file_hash(self, file_path):
178
- """Compute SHA-256 hash of file"""
179
- sha256_hash = hashlib.sha256()
180
- with open(file_path, "rb") as f:
181
- for byte_block in iter(lambda: f.read(4096), b""):
182
- sha256_hash.update(byte_block)
183
- return sha256_hash.hexdigest()
184
-
185
- def verify_document(self, document_id, file_path):
186
- """Simulate document verification on blockchain"""
187
- if not self.is_connected:
188
- return {"status": False, "error": "Wallet not connected"}
189
-
190
- # Calculate hash
191
- document_hash = self.compute_file_hash(file_path)
192
-
193
- # Simulate transaction
194
- tx_hash = "0x" + "".join([format(i, "02x") for i in os.urandom(32)])
195
-
196
- return {
197
- "status": True,
198
- "tx_hash": tx_hash,
199
- "document_id": document_id,
200
- "document_hash": document_hash,
201
- "block_number": 12345678
202
- }
203
-
204
- def log_query(self, query_text, answer_text):
205
- """Simulate logging a query on blockchain"""
206
- if not self.is_connected:
207
- return {"status": False, "error": "Wallet not connected"}
208
-
209
- # Create query data and hash
210
- query_id = f"query_{int(time.time())}"
211
- query_data = {
212
- "query": query_text,
213
- "answer": answer_text,
214
- "timestamp": int(time.time())
215
- }
216
- query_hash = hashlib.sha256(json.dumps(query_data).encode()).hexdigest()
217
-
218
- # Simulate transaction
219
- tx_hash = "0x" + "".join([format(i, "02x") for i in os.urandom(32)])
220
-
221
- return {
222
- "status": True,
223
- "tx_hash": tx_hash,
224
- "query_id": query_id,
225
- "query_hash": query_hash,
226
- "block_number": 12345678
227
- }
228
 
229
- # RAG System Class
230
- class OptimizedRAG:
231
  def __init__(self,
232
- llm_model_name="google/flan-t5-base",
233
  embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
234
  chunk_size=1000,
235
  chunk_overlap=200,
@@ -237,7 +30,16 @@ class OptimizedRAG:
237
  use_blockchain=False,
238
  contract_address=None):
239
  """
240
- Initialize the RAG system optimized for Hugging Face Spaces
 
 
 
 
 
 
 
 
 
241
  """
242
  self.llm_model_name = llm_model_name
243
  self.embedding_model_name = embedding_model_name
@@ -246,6 +48,7 @@ class OptimizedRAG:
246
 
247
  # Device selection for embeddings
248
  self.device = "cuda" if self.use_gpu else "cpu"
 
249
 
250
  # Initialize text splitter
251
  self.text_splitter = RecursiveCharacterTextSplitter(
@@ -260,7 +63,7 @@ class OptimizedRAG:
260
  model_kwargs={"device": self.device}
261
  )
262
 
263
- # Initialize LLM using HuggingFaceHub
264
  try:
265
  # Use HF_TOKEN from environment variables
266
  hf_token = os.environ.get("HF_TOKEN")
@@ -270,7 +73,7 @@ class OptimizedRAG:
270
  self.llm = HuggingFaceHub(
271
  repo_id=llm_model_name,
272
  huggingfacehub_api_token=hf_token,
273
- model_kwargs={"temperature": 0.7, "max_length": 512}
274
  )
275
  except Exception as e:
276
  st.error(f"Error initializing LLM: {str(e)}")
@@ -278,42 +81,60 @@ class OptimizedRAG:
278
  # Fallback to a smaller model
279
  self.llm = HuggingFaceHub(
280
  repo_id="google/flan-t5-small",
281
- model_kwargs={"temperature": 0.7, "max_length": 256}
282
  )
283
 
284
- # Initialize vector store and stats
285
  self.vector_store = None
286
  self.documents_processed = 0
 
 
287
  self.processing_times = {}
288
 
289
- # Initialize blockchain verifier
290
  self.blockchain = None
291
  if use_blockchain:
292
- self.blockchain = BlockchainVerifier(contract_address=contract_address)
 
 
 
 
 
 
 
293
 
294
- def connect_wallet(self, wallet_address):
295
- """Connect wallet for blockchain verification"""
296
- if self.blockchain:
297
- return self.blockchain.connect_wallet(wallet_address)
 
 
 
 
 
298
  return False
299
 
300
  def process_pdfs(self, pdf_files):
301
- """Process PDF files and create vector store"""
302
  all_docs = []
303
 
304
  with st.status("Processing PDF files...") as status:
305
- # Create temporary directory
306
  temp_dir = tempfile.mkdtemp()
307
  st.session_state['temp_dir'] = temp_dir
308
 
309
- # Track processing stats
310
  start_time = time.time()
 
 
311
  mem_before = psutil.virtual_memory().used / (1024 * 1024 * 1024) # GB
312
 
313
- # Process each PDF
314
  for i, pdf_file in enumerate(pdf_files):
315
  try:
316
- # Save uploaded file
 
 
317
  pdf_path = os.path.join(temp_dir, pdf_file.name)
318
  with open(pdf_path, "wb") as f:
319
  f.write(pdf_file.getbuffer())
@@ -330,130 +151,107 @@ class OptimizedRAG:
330
  if page_text:
331
  text += page_text + "\n\n"
332
 
333
- # Create and split documents
334
  docs = [Document(page_content=text, metadata={"source": pdf_file.name})]
 
 
335
  split_docs = self.text_splitter.split_documents(docs)
 
336
  all_docs.extend(split_docs)
337
 
338
- # Verify on blockchain if enabled
339
  if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
340
- document_id = f"{pdf_file.name}_{uuid.uuid4().hex[:8]}"
341
- verification = self.blockchain.verify_document(document_id, pdf_path)
342
-
343
- if verification.get('status'):
344
- st.sidebar.success(f"βœ… {pdf_file.name} verified on blockchain")
 
 
345
 
346
- # Add blockchain metadata
347
- for doc in split_docs:
348
- doc.metadata["blockchain"] = {
349
- "verified": True,
350
- "document_id": document_id,
351
- "document_hash": verification.get("document_hash", ""),
352
- "tx_hash": verification.get("tx_hash", ""),
353
- "block_number": verification.get("block_number", 0)
354
- }
355
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
  except Exception as e:
357
  st.sidebar.error(f"Error processing {pdf_file.name}: {str(e)}")
358
 
359
- # Create vector store
360
  if all_docs:
361
  status.update(label="Building vector index...")
362
  try:
 
363
  index_start_time = time.time()
 
 
364
  self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
365
- index_time = time.time() - index_start_time
366
 
367
- # Track memory usage
368
- mem_after = psutil.virtual_memory().used / (1024 * 1024 * 1024)
 
 
 
369
  mem_used = mem_after - mem_before
370
 
371
- # Save performance metrics
372
  total_time = time.time() - start_time
 
 
 
 
373
  self.processing_times["index_building"] = index_time
374
  self.processing_times["total_time"] = total_time
375
  self.processing_times["memory_used_gb"] = mem_used
376
  self.documents_processed = len(all_docs)
377
 
378
- status.update(label=f"Completed processing {len(all_docs)} chunks", state="complete")
379
  return True
380
  except Exception as e:
381
  st.error(f"Error creating vector store: {str(e)}")
 
382
  return False
383
  else:
384
  status.update(label="No content extracted from PDFs", state="error")
385
  return False
386
 
387
- def direct_retrieval(self, query):
388
- """Direct retrieval method - returns raw document chunks"""
389
  if not self.vector_store:
390
  return "Please upload and process PDF files first."
391
 
392
  try:
393
- # Start timing
394
- query_start_time = time.time()
395
-
396
- # Retrieve relevant documents
397
- retriever = self.vector_store.as_retriever(search_kwargs={"k": 5})
398
- docs = retriever.get_relevant_documents(query)
399
-
400
- # Format sources and answer
401
- sources = []
402
- answer = "Here are the most relevant passages:\n\n"
403
-
404
- for i, doc in enumerate(docs):
405
- # Get blockchain info if available
406
- blockchain_info = None
407
- if "blockchain" in doc.metadata:
408
- blockchain_info = {
409
- "verified": doc.metadata["blockchain"]["verified"],
410
- "document_id": doc.metadata["blockchain"]["document_id"],
411
- "tx_hash": doc.metadata["blockchain"]["tx_hash"]
412
- }
413
-
414
- # Add to answer and sources
415
- answer += f"Passage {i+1} (from {doc.metadata.get('source', 'Unknown')}):\n{doc.page_content}\n\n"
416
- sources.append({
417
- "content": doc.page_content,
418
- "source": doc.metadata.get("source", "Unknown"),
419
- "blockchain": blockchain_info
420
- })
421
-
422
- # Calculate query time
423
- query_time = time.time() - query_start_time
424
-
425
- # Log query to blockchain if enabled
426
- blockchain_log = None
427
- if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
428
- log_result = self.blockchain.log_query(query, answer)
429
- if log_result.get("status"):
430
- blockchain_log = {
431
- "logged": True,
432
- "query_id": log_result.get("query_id", ""),
433
- "tx_hash": log_result.get("tx_hash", "")
434
- }
435
-
436
- return {
437
- "answer": answer,
438
- "sources": sources,
439
- "query_time": query_time,
440
- "blockchain_log": blockchain_log,
441
- "method": "direct"
442
- }
443
-
444
- except Exception as e:
445
- st.error(f"Error in direct retrieval: {str(e)}")
446
- return f"Error: {str(e)}"
447
-
448
- def enhanced_retrieval(self, query):
449
- """Enhanced retrieval - processes through LLM for better answers"""
450
- if not self.vector_store:
451
- return "Please upload and process PDF files first."
452
-
453
- try:
454
- # Create prompt template
455
  prompt_template = """
456
- Answer the question based on the context below.
 
 
 
457
 
458
  Context:
459
  {context}
@@ -467,20 +265,23 @@ class OptimizedRAG:
467
  input_variables=["context", "question"]
468
  )
469
 
470
- # Start timing
471
  query_start_time = time.time()
472
 
473
  # Create QA chain
 
474
  qa = RetrievalQA.from_chain_type(
475
  llm=self.llm,
476
  chain_type="stuff",
477
  retriever=self.vector_store.as_retriever(search_kwargs={"k": 4}),
478
- chain_type_kwargs={"prompt": PROMPT},
479
  return_source_documents=True
480
  )
481
 
482
  # Get answer
483
- response = qa({"query": query})
 
 
484
  answer = response["result"]
485
  source_docs = response["source_documents"]
486
 
@@ -490,7 +291,7 @@ class OptimizedRAG:
490
  # Format sources
491
  sources = []
492
  for i, doc in enumerate(source_docs):
493
- # Get blockchain info if available
494
  blockchain_info = None
495
  if "blockchain" in doc.metadata:
496
  blockchain_info = {
@@ -500,345 +301,327 @@ class OptimizedRAG:
500
  }
501
 
502
  sources.append({
503
- "content": doc.page_content,
504
  "source": doc.metadata.get("source", "Unknown"),
505
  "blockchain": blockchain_info
506
  })
507
 
508
- # Log query to blockchain if enabled
509
  blockchain_log = None
510
  if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
511
- log_result = self.blockchain.log_query(query, answer)
512
- if log_result.get("status"):
513
- blockchain_log = {
514
- "logged": True,
515
- "query_id": log_result.get("query_id", ""),
516
- "tx_hash": log_result.get("tx_hash", "")
517
- }
518
-
 
 
 
 
 
 
 
 
519
  return {
520
  "answer": answer,
521
  "sources": sources,
522
  "query_time": query_time,
523
- "blockchain_log": blockchain_log,
524
- "method": "enhanced"
525
  }
526
 
527
  except Exception as e:
528
- st.error(f"Error in enhanced retrieval: {str(e)}")
529
  return f"Error: {str(e)}"
530
 
531
- def ask(self, query, method="enhanced"):
532
- """Ask a question using the specified method"""
533
- if method == "direct":
534
- return self.direct_retrieval(query)
535
- else:
536
- return self.enhanced_retrieval(query)
 
 
 
 
 
 
 
 
 
 
537
 
538
  # Helper function to initialize session state
539
  def initialize_session_state():
540
- """Initialize Streamlit session state variables"""
541
  if "rag" not in st.session_state:
542
  st.session_state.rag = None
543
  if "messages" not in st.session_state:
544
  st.session_state.messages = []
545
  if "temp_dir" not in st.session_state:
546
  st.session_state.temp_dir = None
547
- if "wallet_connected" not in st.session_state:
548
- st.session_state.wallet_connected = False
549
- if "wallet_address" not in st.session_state:
550
- st.session_state.wallet_address = None
551
- if "retrieval_method" not in st.session_state:
552
- st.session_state.retrieval_method = "enhanced"
553
- if "current_answer" not in st.session_state:
554
- st.session_state.current_answer = None
555
 
556
  # Helper function to clean up temporary files
557
  def cleanup_temp_files():
558
- """Clean up temporary files when application exits"""
559
  if st.session_state.get('temp_dir') and os.path.exists(st.session_state.temp_dir):
560
  try:
561
  shutil.rmtree(st.session_state.temp_dir)
 
562
  except Exception as e:
563
  print(f"Error cleaning up temporary directory: {e}")
564
 
565
- # Create a simple wallet connector UI
566
- def wallet_connector():
567
- st.sidebar.subheader("πŸ”— Blockchain Connection")
568
-
569
- if st.session_state.wallet_connected:
570
- st.sidebar.success(f"βœ… Connected: {st.session_state.wallet_address[:10]}...")
571
- if st.sidebar.button("Disconnect Wallet"):
572
- st.session_state.wallet_connected = False
573
- st.session_state.wallet_address = None
574
- st.rerun()
575
- else:
576
- st.sidebar.info("Connect wallet to verify documents on blockchain")
577
- if st.sidebar.button("Connect Wallet"):
578
- # Generate a mock wallet address
579
- wallet_address = "0x" + "".join([format(i, "02x") for i in os.urandom(20)])
580
- st.session_state.wallet_address = wallet_address
581
- st.session_state.wallet_connected = True
582
-
583
- # Connect to RAG system if initialized
584
- if st.session_state.rag:
585
- st.session_state.rag.connect_wallet(wallet_address)
586
-
587
- st.rerun()
588
 
589
- # Main application UI
590
  def main():
591
- # Load CSS
592
- load_css()
 
 
593
 
594
  # Initialize session state
595
  initialize_session_state()
596
 
597
- # Page header
598
- st.title("πŸ“š Advanced RAG System")
599
- st.markdown("""
600
- <div style="display: flex; gap: 10px; margin-bottom: 20px;">
601
- <div style="background-color: #e3f2fd; padding: 5px 10px; border-radius: 15px; font-size: 0.9em;">
602
- πŸ“„ Document Analysis
603
- </div>
604
- <div style="background-color: #e8f5e9; padding: 5px 10px; border-radius: 15px; font-size: 0.9em;">
605
- πŸ”— Blockchain Verification
606
- </div>
607
- <div style="background-color: #fff3e0; padding: 5px 10px; border-radius: 15px; font-size: 0.9em;">
608
- 🎀 Voice Input
609
- </div>
610
- </div>
611
- """, unsafe_allow_html=True)
 
 
 
 
 
 
612
 
613
- # Sidebar for configuration
614
  with st.sidebar:
615
- # Wallet connector
616
- wallet_connector()
617
-
618
- # System configuration
619
- st.sidebar.subheader("βš™οΈ System Configuration")
620
 
621
  # GPU Detection
622
  gpu_available = torch.cuda.is_available()
623
  if gpu_available:
624
- st.sidebar.success(f"GPU detected and available")
 
 
 
 
 
625
  else:
626
- st.sidebar.warning("No GPU detected. Running in CPU mode.")
627
 
628
- # Model selection with faster models
629
- llm_model = st.sidebar.selectbox(
630
  "LLM Model",
631
  options=[
 
632
  "google/flan-t5-base",
633
- "google/flan-t5-small",
634
- "distilbert/distilgpt2",
635
- "google/flan-ul2"
636
  ],
637
  index=0
638
  )
639
 
640
- embedding_model = st.sidebar.selectbox(
641
  "Embedding Model",
642
  options=[
 
643
  "sentence-transformers/all-MiniLM-L6-v2",
644
- "sentence-transformers/paraphrase-MiniLM-L3-v2",
645
- "sentence-transformers/all-mpnet-base-v2"
646
  ],
647
- index=0
648
  )
649
 
650
- use_gpu = st.sidebar.checkbox("Use GPU Acceleration", value=gpu_available)
651
- use_blockchain = st.sidebar.checkbox("Enable Blockchain", value=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
 
653
- # Contract address - hardcoded for simplicity
654
- contract_address = "0x123abc..." # Your pre-deployed contract
 
 
655
 
656
  # Initialize button
657
- if st.sidebar.button("Initialize System"):
658
- with st.spinner("Setting up RAG system..."):
659
- st.session_state.rag = OptimizedRAG(
660
- llm_model_name=llm_model,
661
- embedding_model_name=embedding_model,
662
- chunk_size=1000,
663
- chunk_overlap=200,
664
- use_gpu=use_gpu and gpu_available,
665
- use_blockchain=use_blockchain,
666
- contract_address=contract_address if use_blockchain else None
667
- )
668
-
669
- # Connect wallet if already connected
670
- if st.session_state.wallet_connected:
671
- st.session_state.rag.connect_wallet(st.session_state.wallet_address)
672
-
673
- st.sidebar.success(f"βœ… System initialized!")
 
 
 
 
 
 
 
 
674
 
675
- # Document upload
676
- st.sidebar.subheader("πŸ“„ Document Upload")
677
- uploaded_files = st.sidebar.file_uploader("Select PDFs", type="pdf", accept_multiple_files=True)
678
 
679
- if uploaded_files and st.sidebar.button("Process Documents"):
680
  if not st.session_state.rag:
681
- with st.spinner("Initializing system first..."):
682
- st.session_state.rag = OptimizedRAG(
683
  llm_model_name=llm_model,
684
  embedding_model_name=embedding_model,
685
- chunk_size=1000,
686
- chunk_overlap=200,
687
  use_gpu=use_gpu and gpu_available,
688
  use_blockchain=use_blockchain,
689
  contract_address=contract_address if use_blockchain else None
690
  )
691
 
692
- # Connect wallet if already connected
693
- if st.session_state.wallet_connected:
694
- st.session_state.rag.connect_wallet(st.session_state.wallet_address)
695
 
696
  success = st.session_state.rag.process_pdfs(uploaded_files)
697
  if success:
698
- st.sidebar.success("πŸ“„ Documents processed successfully!")
699
-
700
- # Method Selection
701
- st.markdown("### Retrieval Method")
702
- col1, col2 = st.columns(2)
 
 
 
 
 
 
703
 
704
- with col1:
705
- direct_class = "method-button direct-method"
706
- if st.session_state.retrieval_method == "direct":
707
- direct_class += " method-active"
708
-
709
- if st.markdown(f"""
710
- <div class="{direct_class}" onclick="this.classList.add('method-active')">
711
- πŸ” Direct Retrieval
712
- </div>
713
- """, unsafe_allow_html=True):
714
- st.session_state.retrieval_method = "direct"
715
- st.rerun()
716
-
717
- with col2:
718
- enhanced_class = "method-button enhanced-method"
719
- if st.session_state.retrieval_method == "enhanced":
720
- enhanced_class += " method-active"
721
-
722
- if st.markdown(f"""
723
- <div class="{enhanced_class}" onclick="this.classList.add('method-active')">
724
- πŸ’‘ Enhanced Answers
725
- </div>
726
- """, unsafe_allow_html=True):
727
- st.session_state.retrieval_method = "enhanced"
728
- st.rerun()
729
-
730
- # Method description
731
- if st.session_state.retrieval_method == "direct":
732
- st.info("πŸ” **Direct Retrieval**: Shows raw document passages. Fast and transparent.")
733
- else:
734
- st.info("πŸ’‘ **Enhanced Answers**: Processes content through AI for better quality answers.")
735
 
736
- # Main Two-Column Layout
737
- answer_col, sources_col = st.columns([2, 1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
 
739
- # Answer column
740
- with answer_col:
741
- st.markdown("### Ask a Question")
 
742
 
743
- # Text input
744
- user_input = st.text_input("Enter your question about the documents")
 
745
 
746
- # Simple voice input simulation
747
- voice_toggle = st.checkbox("Enable voice input")
748
- if voice_toggle:
749
- st.markdown("""
750
- <div style="display: flex; flex-direction: column; align-items: center; margin: 15px 0;">
751
- <div class="voice-button">🎀</div>
752
- <div style="margin-top: 10px; color: #666;">Click to speak</div>
753
- </div>
754
- """, unsafe_allow_html=True)
755
-
756
- if st.button("Simulate Voice Input"):
757
- user_input = "What are the main topics covered in the documents?"
758
- st.info(f"Voice input received: {user_input}")
759
- st.rerun()
760
 
761
- # Process query
762
- if user_input:
763
- # Add user message to history
764
- st.session_state.messages.append({"role": "user", "content": user_input})
765
-
766
- # Check if system is initialized
767
- if not st.session_state.rag:
768
- st.error("Please initialize the system and process PDFs first.")
769
 
770
- # Get response if vector store is ready
771
- elif st.session_state.rag.vector_store:
772
- with st.spinner("Generating answer..."):
773
- # Get retrieval method
774
- method = st.session_state.retrieval_method
775
-
776
- # Get answer
777
- response = st.session_state.rag.ask(user_input, method=method)
778
- st.session_state.messages.append({"role": "assistant", "content": response})
779
 
780
- # Store current answer
781
- st.session_state.current_answer = response
782
 
783
- # Rerun to update UI
784
- st.rerun()
785
- else:
786
- st.error("Please upload and process PDF files first.")
787
-
788
- # Display current answer
789
- if st.session_state.current_answer and isinstance(st.session_state.current_answer, dict):
790
- answer = st.session_state.current_answer
791
-
792
- st.markdown("""
793
- <div class="answer-section">
794
- <h3>Answer</h3>
795
- <div style="white-space: pre-line;">
796
- {answer_text}
797
- </div>
798
- <div style="margin-top: 10px; font-size: 0.8em; color: #666;">
799
- Method: {method_name} | Time: {query_time:.2f}s
800
- </div>
801
- </div>
802
- """.format(
803
- answer_text=answer["answer"],
804
- method_name="Direct Retrieval" if answer["method"] == "direct" else "Enhanced Answer",
805
- query_time=answer["query_time"]
806
- ), unsafe_allow_html=True)
807
-
808
- # Blockchain verification display
809
- if "blockchain_log" in answer and answer["blockchain_log"]:
810
- blockchain_log = answer["blockchain_log"]
811
- st.success(f"βœ… Query logged on blockchain | Transaction: {blockchain_log['tx_hash'][:10]}...")
812
-
813
- # Sources column
814
- with sources_col:
815
- st.markdown("### Sources")
816
-
817
- if st.session_state.current_answer and isinstance(st.session_state.current_answer, dict):
818
- answer = st.session_state.current_answer
819
-
820
- # Display sources
821
- if "sources" in answer and answer["sources"]:
822
- for i, source in enumerate(answer["sources"]):
823
- verified_badge = ""
824
- if source.get("blockchain"):
825
- verified_badge = '<span class="verified-badge">βœ“ Verified</span>'
826
 
827
- st.markdown(f"""
828
- <div class="source-item">
829
- <div class="source-header">
830
- Source {i+1}: {source['source']}
831
- {verified_badge}
832
- </div>
833
- <div style="font-size: 0.9em;">
834
- {source['content'][:200]}...
835
- </div>
836
- </div>
837
- """, unsafe_allow_html=True)
838
- else:
839
- st.info("No sources available for this query.")
 
840
  else:
841
- st.info("Ask a question to see sources here.")
 
 
 
 
842
 
843
  # Main entry point
844
  if __name__ == "__main__":
 
1
+ # main_metamask.py
2
  import os
3
  import tempfile
4
  import shutil
5
  import PyPDF2
6
  import streamlit as st
7
  import torch
8
+ from langchain_huggingface import HuggingFaceEmbeddings
9
+ from langchain_community.llms import HuggingFaceHub
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.vectorstores import FAISS
12
  from langchain.chains import RetrievalQA
13
  from langchain.docstore.document import Document
14
  from langchain.prompts import PromptTemplate
 
 
15
  import time
16
  import psutil
17
  import uuid
18
  import atexit
19
+ from blockchain_utils_metamask import BlockchainManagerMetaMask
20
+ from metamask_component import metamask_connector
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ class BlockchainEnabledRAG:
 
24
  def __init__(self,
25
+ llm_model_name="mistralai/Mistral-7B-Instruct-v0.2",
26
  embedding_model_name="sentence-transformers/all-MiniLM-L6-v2",
27
  chunk_size=1000,
28
  chunk_overlap=200,
 
30
  use_blockchain=False,
31
  contract_address=None):
32
  """
33
+ Initialize the GPU-efficient RAG system with MetaMask blockchain integration.
34
+
35
+ Args:
36
+ llm_model_name: The HuggingFace model for text generation
37
+ embedding_model_name: The HuggingFace model for embeddings
38
+ chunk_size: Size of document chunks
39
+ chunk_overlap: Overlap between chunks
40
+ use_gpu: Whether to use GPU acceleration
41
+ use_blockchain: Whether to enable blockchain verification
42
+ contract_address: Address of the deployed RAG Document Verifier contract
43
  """
44
  self.llm_model_name = llm_model_name
45
  self.embedding_model_name = embedding_model_name
 
48
 
49
  # Device selection for embeddings
50
  self.device = "cuda" if self.use_gpu else "cpu"
51
+ st.sidebar.info(f"Using device: {self.device}")
52
 
53
  # Initialize text splitter
54
  self.text_splitter = RecursiveCharacterTextSplitter(
 
63
  model_kwargs={"device": self.device}
64
  )
65
 
66
+ # Initialize LLM using HuggingFaceHub instead of Ollama
67
  try:
68
  # Use HF_TOKEN from environment variables
69
  hf_token = os.environ.get("HF_TOKEN")
 
73
  self.llm = HuggingFaceHub(
74
  repo_id=llm_model_name,
75
  huggingfacehub_api_token=hf_token,
76
+ model_kwargs={"temperature": 0.7, "max_length": 1024}
77
  )
78
  except Exception as e:
79
  st.error(f"Error initializing LLM: {str(e)}")
 
81
  # Fallback to a smaller model
82
  self.llm = HuggingFaceHub(
83
  repo_id="google/flan-t5-small",
84
+ model_kwargs={"temperature": 0.7, "max_length": 512}
85
  )
86
 
87
+ # Initialize vector store
88
  self.vector_store = None
89
  self.documents_processed = 0
90
+
91
+ # Monitoring stats
92
  self.processing_times = {}
93
 
94
+ # Initialize blockchain manager if enabled
95
  self.blockchain = None
96
  if use_blockchain:
97
+ try:
98
+ self.blockchain = BlockchainManagerMetaMask(
99
+ contract_address=contract_address
100
+ )
101
+ st.sidebar.success("Blockchain manager initialized. Please connect MetaMask to continue.")
102
+ except Exception as e:
103
+ st.sidebar.error(f"Failed to initialize blockchain manager: {str(e)}")
104
+ self.use_blockchain = False
105
 
106
+ def update_blockchain_connection(self, metamask_info):
107
+ """Update blockchain connection with MetaMask info."""
108
+ if self.blockchain and metamask_info:
109
+ self.blockchain.update_connection(
110
+ is_connected=metamask_info.get("connected", False),
111
+ user_address=metamask_info.get("address"),
112
+ network_id=metamask_info.get("network_id")
113
+ )
114
+ return self.blockchain.is_connected
115
  return False
116
 
117
  def process_pdfs(self, pdf_files):
118
+ """Process PDF files, create a vector store, and verify documents on blockchain."""
119
  all_docs = []
120
 
121
  with st.status("Processing PDF files...") as status:
122
+ # Create temporary directory for file storage
123
  temp_dir = tempfile.mkdtemp()
124
  st.session_state['temp_dir'] = temp_dir
125
 
126
+ # Monitor processing time and memory usage
127
  start_time = time.time()
128
+
129
+ # Track memory before processing
130
  mem_before = psutil.virtual_memory().used / (1024 * 1024 * 1024) # GB
131
 
132
+ # Process each PDF file
133
  for i, pdf_file in enumerate(pdf_files):
134
  try:
135
+ file_start_time = time.time()
136
+
137
+ # Save uploaded file to temp directory
138
  pdf_path = os.path.join(temp_dir, pdf_file.name)
139
  with open(pdf_path, "wb") as f:
140
  f.write(pdf_file.getbuffer())
 
151
  if page_text:
152
  text += page_text + "\n\n"
153
 
154
+ # Create documents
155
  docs = [Document(page_content=text, metadata={"source": pdf_file.name})]
156
+
157
+ # Split documents into chunks
158
  split_docs = self.text_splitter.split_documents(docs)
159
+
160
  all_docs.extend(split_docs)
161
 
162
+ # Verify document on blockchain if enabled and connected
163
  if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
164
+ try:
165
+ # Create a unique document ID
166
+ document_id = f"{pdf_file.name}_{uuid.uuid4().hex[:8]}"
167
+
168
+ # Verify document on blockchain
169
+ status.update(label=f"Verifying {pdf_file.name} on blockchain...")
170
+ verification = self.blockchain.verify_document(document_id, pdf_path)
171
 
172
+ if verification.get('status'): # Success
173
+ st.sidebar.success(f"βœ… {pdf_file.name} verified on blockchain")
174
+ if 'tx_hash' in verification:
175
+ st.sidebar.info(f"Transaction: {verification['tx_hash'][:10]}...")
 
 
 
 
 
176
 
177
+ # Add blockchain metadata to documents
178
+ for doc in split_docs:
179
+ doc.metadata["blockchain"] = {
180
+ "verified": True,
181
+ "document_id": document_id,
182
+ "document_hash": verification.get("document_hash", ""),
183
+ "tx_hash": verification.get("tx_hash", ""),
184
+ "block_number": verification.get("block_number", 0)
185
+ }
186
+ else:
187
+ st.sidebar.warning(f"❌ Failed to verify {pdf_file.name} on blockchain")
188
+ if 'error' in verification:
189
+ st.sidebar.error(f"Error: {verification['error']}")
190
+ except Exception as e:
191
+ st.sidebar.error(f"Blockchain verification error: {str(e)}")
192
+ elif self.use_blockchain:
193
+ st.sidebar.warning("MetaMask not connected. Document not verified on blockchain.")
194
+
195
+ file_end_time = time.time()
196
+ processing_time = file_end_time - file_start_time
197
+
198
+ st.sidebar.success(f"Processed {pdf_file.name}: {len(split_docs)} chunks in {processing_time:.2f}s")
199
+ self.processing_times[pdf_file.name] = {
200
+ "chunks": len(split_docs),
201
+ "time": processing_time
202
+ }
203
+
204
  except Exception as e:
205
  st.sidebar.error(f"Error processing {pdf_file.name}: {str(e)}")
206
 
207
+ # Create vector store if we have documents
208
  if all_docs:
209
  status.update(label="Building vector index...")
210
  try:
211
+ # Record the time taken to build the index
212
  index_start_time = time.time()
213
+
214
+ # Create the vector store using FAISS
215
  self.vector_store = FAISS.from_documents(all_docs, self.embeddings)
 
216
 
217
+ index_end_time = time.time()
218
+ index_time = index_end_time - index_start_time
219
+
220
+ # Track memory after processing
221
+ mem_after = psutil.virtual_memory().used / (1024 * 1024 * 1024) # GB
222
  mem_used = mem_after - mem_before
223
 
 
224
  total_time = time.time() - start_time
225
+
226
+ status.update(label=f"Completed processing {len(all_docs)} chunks in {total_time:.2f}s", state="complete")
227
+
228
+ # Save performance metrics
229
  self.processing_times["index_building"] = index_time
230
  self.processing_times["total_time"] = total_time
231
  self.processing_times["memory_used_gb"] = mem_used
232
  self.documents_processed = len(all_docs)
233
 
 
234
  return True
235
  except Exception as e:
236
  st.error(f"Error creating vector store: {str(e)}")
237
+ status.update(label="Error creating vector store", state="error")
238
  return False
239
  else:
240
  status.update(label="No content extracted from PDFs", state="error")
241
  return False
242
 
243
+ def ask(self, query):
244
+ """Ask a question and get an answer based on the PDFs with blockchain logging."""
245
  if not self.vector_store:
246
  return "Please upload and process PDF files first."
247
 
248
  try:
249
+ # Custom prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  prompt_template = """
251
+ You are an AI assistant that provides accurate information based on PDF documents.
252
+
253
+ Use the following context to answer the question. Be detailed and precise in your answer.
254
+ If the answer is not in the context, say "I don't have enough information to answer this question."
255
 
256
  Context:
257
  {context}
 
265
  input_variables=["context", "question"]
266
  )
267
 
268
+ # Start timing the query
269
  query_start_time = time.time()
270
 
271
  # Create QA chain
272
+ chain_type_kwargs = {"prompt": PROMPT}
273
  qa = RetrievalQA.from_chain_type(
274
  llm=self.llm,
275
  chain_type="stuff",
276
  retriever=self.vector_store.as_retriever(search_kwargs={"k": 4}),
277
+ chain_type_kwargs=chain_type_kwargs,
278
  return_source_documents=True
279
  )
280
 
281
  # Get answer
282
+ with st.status("Searching documents and generating answer..."):
283
+ response = qa({"query": query})
284
+
285
  answer = response["result"]
286
  source_docs = response["source_documents"]
287
 
 
291
  # Format sources
292
  sources = []
293
  for i, doc in enumerate(source_docs):
294
+ # Extract blockchain verification info if available
295
  blockchain_info = None
296
  if "blockchain" in doc.metadata:
297
  blockchain_info = {
 
301
  }
302
 
303
  sources.append({
304
+ "content": doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content,
305
  "source": doc.metadata.get("source", "Unknown"),
306
  "blockchain": blockchain_info
307
  })
308
 
309
+ # Log query to blockchain if enabled and connected
310
  blockchain_log = None
311
  if self.use_blockchain and self.blockchain and self.blockchain.is_connected:
312
+ try:
313
+ with st.status("Logging query to blockchain..."):
314
+ log_result = self.blockchain.log_query(query, answer)
315
+
316
+ if log_result.get("status"): # Success
317
+ blockchain_log = {
318
+ "logged": True,
319
+ "query_id": log_result.get("query_id", ""),
320
+ "tx_hash": log_result.get("tx_hash", ""),
321
+ "block_number": log_result.get("block_number", 0)
322
+ }
323
+ else:
324
+ st.error(f"Error logging to blockchain: {log_result.get('error', 'Unknown error')}")
325
+ except Exception as e:
326
+ st.error(f"Error logging to blockchain: {str(e)}")
327
+
328
  return {
329
  "answer": answer,
330
  "sources": sources,
331
  "query_time": query_time,
332
+ "blockchain_log": blockchain_log
 
333
  }
334
 
335
  except Exception as e:
336
+ st.error(f"Error generating answer: {str(e)}")
337
  return f"Error: {str(e)}"
338
 
339
+ def get_performance_metrics(self):
340
+ """Return performance metrics for the RAG system."""
341
+ if not self.processing_times:
342
+ return None
343
+
344
+ return {
345
+ "documents_processed": self.documents_processed,
346
+ "index_building_time": self.processing_times.get("index_building", 0),
347
+ "total_processing_time": self.processing_times.get("total_time", 0),
348
+ "memory_used_gb": self.processing_times.get("memory_used_gb", 0),
349
+ "device": self.device,
350
+ "embedding_model": self.embedding_model_name,
351
+ "blockchain_enabled": self.use_blockchain,
352
+ "blockchain_connected": self.blockchain.is_connected if self.blockchain else False
353
+ }
354
+
355
 
356
  # Helper function to initialize session state
357
  def initialize_session_state():
358
+ """Initialize Streamlit session state variables."""
359
  if "rag" not in st.session_state:
360
  st.session_state.rag = None
361
  if "messages" not in st.session_state:
362
  st.session_state.messages = []
363
  if "temp_dir" not in st.session_state:
364
  st.session_state.temp_dir = None
365
+ if "metamask_connected" not in st.session_state:
366
+ st.session_state.metamask_connected = False
 
 
 
 
 
 
367
 
368
  # Helper function to clean up temporary files
369
  def cleanup_temp_files():
370
+ """Clean up temporary files when application exits."""
371
  if st.session_state.get('temp_dir') and os.path.exists(st.session_state.temp_dir):
372
  try:
373
  shutil.rmtree(st.session_state.temp_dir)
374
+ print(f"Cleaned up temporary directory: {st.session_state.temp_dir}")
375
  except Exception as e:
376
  print(f"Error cleaning up temporary directory: {e}")
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
379
+ # Streamlit UI
380
  def main():
381
+ st.set_page_config(page_title="Blockchain-Enabled RAG System", layout="wide")
382
+
383
+ st.title("πŸš€ GPU-Accelerated PDF Question Answering with MetaMask Blockchain Verification")
384
+ st.markdown("Upload PDFs, verify them on blockchain with MetaMask, and ask questions with audit log")
385
 
386
  # Initialize session state
387
  initialize_session_state()
388
 
389
+ # MetaMask Connection Section
390
+ st.header("🦊 MetaMask Connection")
391
+ st.markdown("Connect your MetaMask wallet to verify documents and log queries on the blockchain.")
392
+
393
+ # Add MetaMask connector and get connection info
394
+ metamask_info = metamask_connector()
395
+
396
+ # Display MetaMask connection status
397
+ if metamask_info and metamask_info.get("connected"):
398
+ st.success(f"βœ… MetaMask Connected: {metamask_info.get('address')}")
399
+ st.info(f"Network: {metamask_info.get('network_name')}")
400
+ st.session_state.metamask_connected = True
401
+ else:
402
+ st.warning("⚠️ MetaMask not connected. Please connect your wallet to use blockchain features.")
403
+ st.session_state.metamask_connected = False
404
+
405
+ # Update RAG system with MetaMask connection if needed
406
+ if st.session_state.rag and metamask_info:
407
+ is_connected = st.session_state.rag.update_blockchain_connection(metamask_info)
408
+ if is_connected:
409
+ st.success("RAG system updated with MetaMask connection")
410
 
411
+ # Sidebar for configuration and file upload
412
  with st.sidebar:
413
+ st.header("βš™οΈ Configuration")
 
 
 
 
414
 
415
  # GPU Detection
416
  gpu_available = torch.cuda.is_available()
417
  if gpu_available:
418
+ try:
419
+ gpu_info = torch.cuda.get_device_properties(0)
420
+ st.success(f"GPU detected: {gpu_info.name} ({gpu_info.total_memory / 1024**3:.1f} GB)")
421
+ except Exception as e:
422
+ st.warning(f"GPU detected but couldn't get properties: {str(e)}")
423
+ st.info("Running with limited GPU information")
424
  else:
425
+ st.warning("No GPU detected. Running in CPU mode.")
426
 
427
+ # Model selection
428
+ llm_model = st.selectbox(
429
  "LLM Model",
430
  options=[
431
+ "mistralai/Mistral-7B-Instruct-v0.2",
432
  "google/flan-t5-base",
433
+ "tiiuae/falcon-7b-instruct"
 
 
434
  ],
435
  index=0
436
  )
437
 
438
+ embedding_model = st.selectbox(
439
  "Embedding Model",
440
  options=[
441
+ "sentence-transformers/all-mpnet-base-v2",
442
  "sentence-transformers/all-MiniLM-L6-v2",
443
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
 
444
  ],
445
+ index=1 # all-MiniLM-L6-v2 is smaller and faster
446
  )
447
 
448
+ use_gpu = st.checkbox("Use GPU Acceleration", value=gpu_available)
449
+
450
+ # Blockchain configuration
451
+ st.header("πŸ”— Blockchain Configuration")
452
+ use_blockchain = st.checkbox("Enable Blockchain Verification", value=True)
453
+
454
+ if use_blockchain:
455
+ contract_address = st.text_input("Contract Address",
456
+ value="0x0000000000000000000000000000000000000000")
457
+
458
+ # Display MetaMask connection status in sidebar
459
+ if metamask_info and metamask_info.get("connected"):
460
+ st.success(f"βœ… MetaMask Connected: {metamask_info.get('address')[:10]}...")
461
+ else:
462
+ st.warning("⚠️ MetaMask not connected. Please connect your wallet above.")
463
+
464
+ if not contract_address or contract_address == "0x0000000000000000000000000000000000000000":
465
+ st.error("Please deploy the contract and enter its address")
466
 
467
+ # Advanced options
468
+ with st.expander("Advanced Options"):
469
+ chunk_size = st.slider("Chunk Size", 100, 2000, 1000)
470
+ chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200)
471
 
472
  # Initialize button
473
+ if st.button("Initialize System"):
474
+ with st.spinner("Initializing RAG system..."):
475
+ if use_blockchain and not contract_address:
476
+ st.error("Contract address is required for blockchain integration")
477
+ else:
478
+ st.session_state.rag = BlockchainEnabledRAG(
479
+ llm_model_name=llm_model,
480
+ embedding_model_name=embedding_model,
481
+ chunk_size=chunk_size,
482
+ chunk_overlap=chunk_overlap,
483
+ use_gpu=use_gpu and gpu_available,
484
+ use_blockchain=use_blockchain,
485
+ contract_address=contract_address if use_blockchain else None
486
+ )
487
+
488
+ # Update with current MetaMask connection if available
489
+ if use_blockchain and metamask_info:
490
+ st.session_state.rag.update_blockchain_connection(metamask_info)
491
+
492
+ st.success(f"System initialized with {embedding_model} on {st.session_state.rag.device}")
493
+ if use_blockchain:
494
+ if metamask_info and metamask_info.get("connected"):
495
+ st.success("Blockchain verification enabled with MetaMask")
496
+ else:
497
+ st.warning("Blockchain verification enabled but MetaMask not connected")
498
 
499
+ st.header("πŸ“„ Upload Documents")
500
+ uploaded_files = st.file_uploader("Select PDFs", type="pdf", accept_multiple_files=True)
 
501
 
502
+ if uploaded_files and st.button("Process PDFs"):
503
  if not st.session_state.rag:
504
+ with st.spinner("Initializing RAG system..."):
505
+ st.session_state.rag = BlockchainEnabledRAG(
506
  llm_model_name=llm_model,
507
  embedding_model_name=embedding_model,
508
+ chunk_size=chunk_size,
509
+ chunk_overlap=chunk_overlap,
510
  use_gpu=use_gpu and gpu_available,
511
  use_blockchain=use_blockchain,
512
  contract_address=contract_address if use_blockchain else None
513
  )
514
 
515
+ # Update with current MetaMask connection if available
516
+ if use_blockchain and metamask_info:
517
+ st.session_state.rag.update_blockchain_connection(metamask_info)
518
 
519
  success = st.session_state.rag.process_pdfs(uploaded_files)
520
  if success:
521
+ metrics = st.session_state.rag.get_performance_metrics()
522
+ if metrics:
523
+ st.success("PDFs processed successfully!")
524
+ with st.expander("πŸ’Ή Performance Metrics"):
525
+ st.markdown(f"**Documents processed:** {metrics['documents_processed']} chunks")
526
+ st.markdown(f"**Index building time:** {metrics['index_building_time']:.2f} seconds")
527
+ st.markdown(f"**Total processing time:** {metrics['total_processing_time']:.2f} seconds")
528
+ st.markdown(f"**Memory used:** {metrics['memory_used_gb']:.2f} GB")
529
+ st.markdown(f"**Device used:** {metrics['device']}")
530
+ st.markdown(f"**Blockchain verification:** {'Enabled' if metrics['blockchain_enabled'] else 'Disabled'}")
531
+ st.markdown(f"**Blockchain connected:** {'Yes' if metrics.get('blockchain_connected') else 'No'}")
532
 
533
+ # Blockchain verification info
534
+ if st.session_state.rag and st.session_state.rag.use_blockchain:
535
+ if st.session_state.metamask_connected:
536
+ st.info("πŸ”— Blockchain verification is enabled with MetaMask. Documents are cryptographically verified and queries are logged with immutable audit trail.")
537
+ else:
538
+ st.warning("πŸ”— Blockchain verification is enabled but MetaMask is not connected. Please connect your MetaMask wallet to use blockchain features.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
+ # Display chat messages
541
+ for message in st.session_state.messages:
542
+ with st.chat_message(message["role"]):
543
+ if message["role"] == "user":
544
+ st.markdown(message["content"])
545
+ else:
546
+ if isinstance(message["content"], dict):
547
+ st.markdown(message["content"]["answer"])
548
+
549
+ if "query_time" in message["content"]:
550
+ st.caption(f"Response time: {message['content']['query_time']:.2f} seconds")
551
+
552
+ # Display blockchain log if available
553
+ if "blockchain_log" in message["content"] and message["content"]["blockchain_log"]:
554
+ blockchain_log = message["content"]["blockchain_log"]
555
+ st.success(f"βœ… Query logged on blockchain | Transaction: {blockchain_log['tx_hash'][:10]}...")
556
+
557
+ # Display sources in expander
558
+ if "sources" in message["content"] and message["content"]["sources"]:
559
+ with st.expander("πŸ“„ View Sources"):
560
+ for i, source in enumerate(message["content"]["sources"]):
561
+ st.markdown(f"**Source {i+1}: {source['source']}**")
562
+
563
+ # Show blockchain verification if available
564
+ if source.get("blockchain"):
565
+ st.success(f"βœ… Verified on blockchain | TX: {source['blockchain']['tx_hash'][:10]}...")
566
+
567
+ st.text(source["content"])
568
+ st.divider()
569
+ else:
570
+ st.markdown(message["content"])
571
 
572
+ # Chat input
573
+ if prompt := st.chat_input("Ask a question about your PDFs..."):
574
+ # Add user message to chat
575
+ st.session_state.messages.append({"role": "user", "content": prompt})
576
 
577
+ # Display user message
578
+ with st.chat_message("user"):
579
+ st.markdown(prompt)
580
 
581
+ # Check if system is initialized
582
+ if not st.session_state.rag:
583
+ with st.chat_message("assistant"):
584
+ message = "Please initialize the system and process PDFs first."
585
+ st.markdown(message)
586
+ st.session_state.messages.append({"role": "assistant", "content": message})
 
 
 
 
 
 
 
 
587
 
588
+ # Get response if vector store is ready
589
+ elif st.session_state.rag.vector_store:
590
+ with st.chat_message("assistant"):
591
+ response = st.session_state.rag.ask(prompt)
592
+ st.session_state.messages.append({"role": "assistant", "content": response})
 
 
 
593
 
594
+ if isinstance(response, dict):
595
+ st.markdown(response["answer"])
 
 
 
 
 
 
 
596
 
597
+ if "query_time" in response:
598
+ st.caption(f"Response time: {response['query_time']:.2f} seconds")
599
 
600
+ # Display blockchain log if available
601
+ if "blockchain_log" in response and response["blockchain_log"]:
602
+ blockchain_log = response["blockchain_log"]
603
+ st.success(f"βœ… Query logged on blockchain | Transaction: {blockchain_log['tx_hash'][:10]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
 
605
+ # Display sources in expander
606
+ if "sources" in response and response["sources"]:
607
+ with st.expander("πŸ“„ View Sources"):
608
+ for i, source in enumerate(response["sources"]):
609
+ st.markdown(f"**Source {i+1}: {source['source']}**")
610
+
611
+ # Show blockchain verification if available
612
+ if source.get("blockchain"):
613
+ st.success(f"βœ… Verified on blockchain | TX: {source['blockchain']['tx_hash'][:10]}...")
614
+
615
+ st.text(source["content"])
616
+ st.divider()
617
+ else:
618
+ st.markdown(response)
619
  else:
620
+ with st.chat_message("assistant"):
621
+ message = "Please upload and process PDF files first."
622
+ st.markdown(message)
623
+ st.session_state.messages.append({"role": "assistant", "content": message})
624
+
625
 
626
  # Main entry point
627
  if __name__ == "__main__":