abdull4h committed
Commit 39d753a · verified · 1 Parent(s): 4cf7bd8

Update app.py

Files changed (1):
  1. app.py +121 -106
app.py CHANGED
@@ -1,5 +1,5 @@
- # Vision 2030 Virtual Assistant with RAG and Evaluation Framework
- # Modified for Hugging Face Spaces compatibility with GPU support
+ # Minimal version for Hugging Face Spaces
+ # File: app.py
 
  import gradio as gr
  import time
@@ -12,6 +12,7 @@ import pandas as pd
  import matplotlib.pyplot as plt
  from sklearn.metrics import precision_recall_fscore_support, accuracy_score
  import PyPDF2
+ import io
  import json
  from langdetect import detect
  from sentence_transformers import SentenceTransformer
@@ -29,42 +30,29 @@ logging.basicConfig(
  )
  logger = logging.getLogger('vision2030_assistant')
 
- # Check for GPU availability
+ # Check for GPU availability (but don't rely on it)
  has_gpu = torch.cuda.is_available()
  logger.info(f"GPU available: {has_gpu}")
 
  class Vision2030Assistant:
-     def __init__(self, pdf_path=None, eval_data_path=None):
-         """
-         Initialize the Vision 2030 Assistant with embedding models and evaluation framework
-
-         Args:
-             pdf_path: Path to the Vision 2030 PDF document
-             eval_data_path: Path to evaluation dataset
-         """
+     def __init__(self):
+         """Initialize the Vision 2030 Assistant with embedding models and sample data"""
          logger.info("Initializing Vision 2030 Assistant...")
 
-         # Initialize embedding models only (no LLMs to avoid tokenizer issues)
+         # Initialize embedding models
          self.load_embedding_models()
 
-         # Load documents
-         if pdf_path and os.path.exists(pdf_path):
-             self.load_and_process_documents(pdf_path)
-         else:
-             self._create_sample_data()
-             self._create_indices()
-
-         # Setup evaluation framework
-         if eval_data_path and os.path.exists(eval_data_path):
-             with open(eval_data_path, 'r', encoding='utf-8') as f:
-                 self.eval_data = json.load(f)
-         else:
-             self._create_sample_eval_data()
-
+         # Use sample data to start
+         self._create_sample_data()
+         self._create_indices()
+
+         # Create sample evaluation data
+         self._create_sample_eval_data()
+
+         # Initialize metrics
          self.metrics = {
              "response_times": [],
              "user_ratings": [],
-             "retrieval_precision": [],
              "factual_accuracy": []
          }
          self.response_history = []
@@ -73,7 +61,7 @@ class Vision2030Assistant:
      @spaces.GPU
      def load_embedding_models(self):
          """Load embedding models for retrieval with GPU support"""
-         logger.info("Loading embedding models with GPU support...")
+         logger.info("Loading embedding models...")
 
          try:
              # Load embedding models
@@ -89,7 +77,7 @@
              logger.info("Embedding models loaded successfully")
          except Exception as e:
              logger.error(f"Error loading embedding models: {str(e)}")
-             # Create simple placeholder models if loading fails
+             # Create simple fallback embedding method
              self._create_fallback_embedders()
 
      def _create_fallback_embedders(self):
@@ -118,51 +106,8 @@
          self.arabic_embedder = SimpleEmbedder()
          self.english_embedder = SimpleEmbedder()
 
-     def load_and_process_documents(self, pdf_path):
-         """Load and process the Vision 2030 document from PDF"""
-         logger.info(f"Processing Vision 2030 document from {pdf_path}")
-
-         # Initialize empty document lists
-         self.english_texts = []
-         self.arabic_texts = []
-
-         try:
-             # Extract text from PDF
-             with open(pdf_path, 'rb') as file:
-                 reader = PyPDF2.PdfReader(file)
-                 full_text = ""
-                 for page_num in range(len(reader.pages)):
-                     page = reader.pages[page_num]
-                     full_text += page.extract_text() + "\n"
-
-             # Split into chunks (simple approach - could be improved with better text segmentation)
-             chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
-
-             # Detect language and add to appropriate list
-             for chunk in chunks:
-                 try:
-                     lang = detect(chunk)
-                     if lang == "ar":
-                         self.arabic_texts.append(chunk)
-                     else:  # Default to English for other languages
-                         self.english_texts.append(chunk)
-                 except:
-                     # If language detection fails, assume English
-                     self.english_texts.append(chunk)
-
-             logger.info(f"Processed {len(self.arabic_texts)} Arabic and {len(self.english_texts)} English chunks")
-
-             # Create FAISS indices
-             self._create_indices()
-
-         except Exception as e:
-             logger.error(f"Error processing PDF: {str(e)}")
-             logger.info("Using fallback sample data")
-             self._create_sample_data()
-             self._create_indices()
-
      def _create_sample_data(self):
-         """Create sample Vision 2030 data if PDF processing fails"""
+         """Create sample Vision 2030 data"""
          logger.info("Creating sample Vision 2030 data")
 
          # English sample texts
@@ -195,7 +140,7 @@
 
      @spaces.GPU
      def _create_indices(self):
-         """Create FAISS indices for fast text retrieval with GPU support"""
+         """Create FAISS indices for fast text retrieval"""
          logger.info("Creating FAISS indices for text retrieval")
 
          try:
@@ -275,23 +220,13 @@
                  "question": "ما هو مشروع البحر الأحمر؟",
                  "lang": "ar",
                  "reference_answer": "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي."
-             },
-             {
-                 "question": "What are the goals for women's workforce participation?",
-                 "lang": "en",
-                 "reference_answer": "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%."
-             },
-             {
-                 "question": "ما هي القدية؟",
-                 "lang": "ar",
-                 "reference_answer": "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030."
              }
          ]
          logger.info(f"Created {len(self.eval_data)} sample evaluation examples")
 
      @spaces.GPU
      def retrieve_context(self, query, lang):
-         """Retrieve relevant context for a query based on language with GPU support"""
+         """Retrieve relevant context for a query based on language"""
          start_time = time.time()
 
          try:
@@ -323,7 +258,10 @@
              return ""
 
      def generate_response(self, user_input):
-         """Generate a response to user input using retrieval and predefined responses for evaluation"""
+         """Generate a response to user input using retrieval and predefined responses"""
+         if not user_input or user_input.strip() == "":
+             return ""
+
          start_time = time.time()
 
          # Default response in case of failure
@@ -346,7 +284,7 @@
          # Retrieve relevant context
          context = self.retrieve_context(user_input, lang)
 
-         # Simplified response generation for HF Spaces
+         # Simplified response generation
          if lang == "ar":
              if "ركائز" in user_input or "اركان" in user_input:
                  reply = "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
@@ -358,6 +296,8 @@
                  reply = "تهدف رؤية 2030 إلى زيادة مشاركة المرأة في القوى العاملة من 22٪ إلى 30٪."
              elif "القدية" in user_input:
                  reply = "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030."
+             elif "ماهي" in user_input or "ما هي" in user_input:
+                 reply = "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
              else:
                  # Use the retrieved context directly if available
                  reply = context if context else "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
@@ -372,6 +312,8 @@
                  reply = "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%."
              elif "qiddiya" in user_input.lower():
                  reply = "Qiddiya is an entertainment mega-project being built in Riyadh as part of Vision 2030."
+             elif "what is" in user_input.lower():
+                 reply = "Vision 2030 is Saudi Arabia's strategic framework to reduce dependence on oil, diversify the economy, and develop public sectors. The key pillars are a vibrant society, a thriving economy, and an ambitious nation."
              else:
                  # Use the retrieved context directly if available
                  reply = context if context else "I couldn't find enough information about this question."
@@ -422,7 +364,7 @@
 
      @spaces.GPU
      def evaluate_on_test_set(self):
-         """Evaluate the assistant on the test set with GPU support"""
+         """Evaluate the assistant on the test set"""
          logger.info("Running evaluation on test set")
 
          eval_results = []
@@ -516,14 +458,70 @@
 
          return True
 
+     @spaces.GPU
+     def process_uploaded_pdf(self, file):
+         """Process uploaded PDF and extract text content"""
+         if file is None:
+             return "No file uploaded. Please select a PDF file."
+
+         try:
+             logger.info(f"Processing uploaded file")
+
+             # Use PyPDF2 to read the file content directly
+             reader = PyPDF2.PdfReader(file)
+
+             # Extract text from the PDF
+             full_text = ""
+             for page_num in range(len(reader.pages)):
+                 page = reader.pages[page_num]
+                 extracted_text = page.extract_text()
+                 if extracted_text:
+                     full_text += extracted_text + "\n"
+
+             if not full_text.strip():
+                 return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+             # Process the extracted text
+             chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
+
+             # Categorize text by language
+             english_chunks = []
+             arabic_chunks = []
+
+             for chunk in chunks:
+                 try:
+                     lang = detect(chunk)
+                     if lang == "ar":
+                         arabic_chunks.append(chunk)
+                     else:
+                         english_chunks.append(chunk)
+                 except:
+                     # If language detection fails, assume English
+                     english_chunks.append(chunk)
+
+             # Update the assistant's knowledge base
+             self.english_texts = english_chunks
+             self.arabic_texts = arabic_chunks
+
+             # Recreate indices
+             self._create_indices()
+
+             logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
+
+             return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments."
+
+         except Exception as e:
+             logger.error(f"Error processing PDF: {str(e)}")
+             return f"❌ Error processing the PDF: {str(e)}. Please try another file."
+
  # Create the Gradio interface
- def create_gradio_interface():
+ def create_interface():
      try:
          # Initialize the assistant
          assistant = Vision2030Assistant()
 
          def chat(message, history):
-             if not message.strip():
+             if not message or message.strip() == "":
                  return history, ""
 
              # Generate response
@@ -568,15 +566,6 @@ def create_gradio_interface():
              fig = assistant.visualize_evaluation_results(results)
 
              return summary, fig
-
-         @spaces.GPU
-         def process_uploaded_file(file):
-             if file is not None:
-                 # Create a new assistant with the uploaded PDF
-                 global assistant
-                 assistant = Vision2030Assistant(pdf_path=file.name)
-                 return f"Successfully processed {file.name}. The assistant is ready to use."
-             return "No file uploaded. Using sample data."
 
          # Create the Gradio interface
          with gr.Blocks() as demo:
@@ -603,9 +592,34 @@ def create_gradio_interface():
              eval_chart = gr.Plot(label="Evaluation Metrics")
 
              with gr.Tab("Upload PDF"):
-                 file_input = gr.File(label="Upload Vision 2030 PDF")
-                 upload_result = gr.Textbox(label="Upload Status")
-                 upload_btn = gr.Button("Process PDF")
+                 gr.Markdown("""
+                 ### Upload a Vision 2030 PDF Document
+                 Upload a PDF document to enhance the assistant's knowledge base.
+                 """)
+
+                 with gr.Row():
+                     file_input = gr.File(
+                         label="Select PDF File",
+                         file_types=[".pdf"],
+                         type="binary"  # Important: Use binary mode
+                     )
+
+                 with gr.Row():
+                     upload_btn = gr.Button("Process PDF", variant="primary")
+
+                 with gr.Row():
+                     upload_status = gr.Textbox(
+                         label="Upload Status",
+                         placeholder="Upload status will appear here...",
+                         interactive=False
+                     )
+
+                 gr.Markdown("""
+                 ### Notes:
+                 - The PDF should contain text that can be extracted (not scanned images)
+                 - After uploading, you can return to the Chat tab to ask questions about the uploaded content
+                 - If no PDF is uploaded, the assistant will use default Vision 2030 information
+                 """)
 
              # Set up event handlers
              msg.submit(chat, [msg, chatbot], [chatbot, msg])
@@ -613,18 +627,19 @@ def create_gradio_interface():
              clear_btn.click(lambda: [], None, chatbot)
              feedback_btn.click(provide_feedback, [chatbot, rating, feedback_text], feedback_result)
              evaluate_btn.click(run_evaluation, None, [eval_output, eval_chart])
-             upload_btn.click(process_uploaded_file, [file_input], upload_result)
+             upload_btn.click(assistant.process_uploaded_pdf, [file_input], [upload_status])
 
          return demo
+
      except Exception as e:
          logger.error(f"Error creating Gradio interface: {str(e)}")
-         # Create a simple demo for fallback
+         # Create a simple fallback demo if there's an error
          with gr.Blocks() as demo:
              gr.Markdown("# Vision 2030 Virtual Assistant")
              gr.Markdown("There was an error initializing the assistant. Please check the logs.")
              gr.Markdown(f"Error: {str(e)}")
          return demo
 
- # Launch the app with proper GPU initialization
- demo = create_gradio_interface()
+ # Launch the app
+ demo = create_interface()
  demo.launch()
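
A note on the new upload path: with type="binary", Gradio passes the handler the raw bytes of the uploaded file, while PyPDF2.PdfReader expects a file path or a seekable stream rather than a bytes object. The commit adds import io but still passes file straight to PdfReader, so presumably the bytes are meant to be wrapped in io.BytesIO first. A minimal sketch of that wrapping, assuming the binary input above (the helper name read_pdf_bytes is hypothetical, not part of the commit):

    import io
    import PyPDF2

    def read_pdf_bytes(data: bytes) -> str:
        # Wrap the raw bytes from a Gradio type="binary" File input in a
        # seekable buffer so PyPDF2 can parse it.
        reader = PyPDF2.PdfReader(io.BytesIO(data))
        # extract_text() can return None for pages without extractable text
        return "\n".join(page.extract_text() or "" for page in reader.pages)

Calling a helper like this at the top of process_uploaded_pdf would leave the chunking and language-detection logic unchanged.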