abdull4h committed on
Commit
ea4bdbe
·
verified ·
1 Parent(s): 8f83e1c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +305 -250
app.py CHANGED
@@ -1,11 +1,11 @@
1
  import os
2
  import re
 
3
  import torch
4
- import gradio as gr
5
  import numpy as np
6
- from pathlib import Path
7
  from tqdm import tqdm
8
- import json
9
 
10
  # PDF processing
11
  import PyPDF2
@@ -26,15 +26,16 @@ from bidi.algorithm import get_display
26
 
27
  # Evaluation
28
  from rouge_score import rouge_scorer
 
 
 
 
 
29
 
30
- # Helper functions from your notebook
31
- def detect_language(text):
32
- """Detect if text is primarily Arabic or English"""
33
- # Simple heuristic: count Arabic characters
34
- arabic_chars = re.findall(r'[\u0600-\u06FF]', text)
35
- is_arabic = len(arabic_chars) > len(text) * 0.5
36
- return "arabic" if is_arabic else "english"
37
 
 
38
  def safe_tokenize(text):
39
  """Pure regex tokenizer with no NLTK dependency"""
40
  if not text:
@@ -44,7 +45,14 @@ def safe_tokenize(text):
44
  # Split on whitespace and filter empty strings
45
  return [token for token in re.split(r'\s+', text.lower()) if token]
46
 
47
- # Evaluation metric functions
 
 
 
 
 
 
 
48
  def calculate_bleu(prediction, reference):
49
  """Calculate BLEU score without any NLTK dependency"""
50
  # Tokenize texts using our own tokenizer
@@ -116,12 +124,22 @@ def calculate_f1_precision_recall(prediction, reference):
116
 
117
  return {'precision': precision, 'recall': recall, 'f1': f1}
118
 
119
- # Load PDFs and create vector store
120
- def process_pdfs(pdf_files):
121
- """Process uploaded PDF documents and return document objects"""
 
 
 
 
 
 
 
 
 
 
122
  documents = []
123
 
124
- for pdf_path in pdf_files:
125
  try:
126
  text = ""
127
  with open(pdf_path, 'rb') as file:
@@ -180,6 +198,7 @@ def create_vector_store(documents):
180
 
181
  return vector_store
182
 
 
183
  def load_model_and_tokenizer():
184
  """Load the ALLaM-7B model and tokenizer with error handling"""
185
  model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
@@ -299,7 +318,7 @@ Question: {query} [/INST]</s>"""
299
  # Fallback response
300
  return "I apologize, but I encountered an error while generating a response."
301
 
302
- # Assistant class
303
  class Vision2030Assistant:
304
  def __init__(self, model, tokenizer, vector_store):
305
  self.model = model
@@ -344,8 +363,9 @@ class Vision2030Assistant:
344
  self.conversation_history = []
345
  return "Conversation has been reset."
346
 
347
- # Sample evaluation data (subset)
348
- sample_evaluation_data = [
 
349
  {
350
  "query": "ما هي رؤية السعودية 2030؟",
351
  "reference": "رؤية السعودية 2030 هي خطة استراتيجية تهدف إلى تنويع الاقتصاد السعودي وتقليل الاعتماد على النفط مع تطوير قطاعات مختلفة مثل الصحة والتعليم والسياحة.",
@@ -358,6 +378,8 @@ sample_evaluation_data = [
358
  "category": "overview",
359
  "language": "english"
360
  },
 
 
361
  {
362
  "query": "ما هي الأهداف الاقتصادية لرؤية 2030؟",
363
  "reference": "تشمل الأهداف الاقتصادية زيادة مساهمة القطاع الخاص إلى 65%، وزيادة الصادرات غير النفطية إلى 50% من الناتج المحلي غير النفطي، وخفض البطالة إلى 7%.",
@@ -370,261 +392,294 @@ sample_evaluation_data = [
370
  "category": "economic",
371
  "language": "english"
372
  },
 
 
373
  {
374
- "query": "How does Vision 2030 support small and medium enterprises (SMEs)?",
375
- "reference": "Vision 2030 supports SMEs by increasing their GDP contribution, facilitating access to funding, and reducing regulatory obstacles.",
376
- "category": "economic",
 
 
 
 
 
 
377
  "language": "english"
378
  }
379
  ]
380
 
381
- # Global variables for storing state
382
- ASSISTANT = None
383
- MODEL = None
384
- TOKENIZER = None
385
- VECTOR_STORE = None
386
- PDF_PATHS = ["vision2030_docs/saudi_vision203.pdf", "vision2030_docs/saudi_vision2030_ar.pdf"]
387
-
388
- # Initialize evaluation
389
- rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
390
-
391
  def initialize_system():
392
- global MODEL, TOKENIZER, VECTOR_STORE, ASSISTANT
393
-
394
- # Try to load from saved files first
395
- if os.path.exists("data/vision2030_vector_store"):
396
- print("Loading vector store from saved file...")
397
- try:
398
- embedding_function = HuggingFaceEmbeddings(
399
- model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
400
- )
401
- VECTOR_STORE = FAISS.load_local("data/vision2030_vector_store", embedding_function)
402
- print("Vector store loaded successfully!")
403
- except Exception as e:
404
- print(f"Error loading vector store: {e}")
405
- VECTOR_STORE = None
406
-
407
- # If vector store not loaded, process PDFs and create it
408
- if VECTOR_STORE is None:
409
- print("Processing PDF documents...")
410
- vision2030_docs = process_pdfs(PDF_PATHS)
411
-
412
- if not vision2030_docs:
413
- return "Error: No documents were processed. Cannot continue."
414
-
415
- print("Creating vector store...")
416
- VECTOR_STORE = create_vector_store(vision2030_docs)
417
-
418
- # Save the vector store for future use
419
- os.makedirs("data", exist_ok=True)
420
- VECTOR_STORE.save_local("data/vision2030_vector_store")
421
- print("Vector store saved to data/vision2030_vector_store")
 
 
422
 
423
  # Load model and tokenizer
424
- print("Loading ALLaM-7B model...")
425
- MODEL, TOKENIZER = load_model_and_tokenizer()
426
 
427
  # Initialize assistant
428
- ASSISTANT = Vision2030Assistant(MODEL, TOKENIZER, VECTOR_STORE)
429
- print("Vision 2030 Assistant initialized successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
- return "System initialized and ready!"
432
 
433
- def process_query(query, reference=None):
434
- """Process a user query and return the response with evaluation if reference is provided"""
435
- if ASSISTANT is None:
436
- return "System not initialized. Please initialize first.", "", "", "", ""
437
-
438
- # Process query
439
- response, sources, contexts = ASSISTANT.answer(query)
440
-
441
- # Additional details
442
- language = detect_language(query)
443
- source_text = "\n".join([f"Source: {s}" for s in sources])
444
- context_text = "\n\n".join([f"Context {i+1}: {ctx['content'][:200]}..." for i, ctx in enumerate(contexts)])
445
-
446
- # Calculate metrics if reference is provided
447
- metrics_text = ""
448
- if reference:
449
- # ROUGE scores
450
- rouge_scores = rouge_scorer_instance.score(response, reference)
451
-
452
- # BLEU scores
453
- bleu_scores = calculate_bleu(response, reference)
454
-
455
- # METEOR score
456
- meteor = calculate_meteor(response, reference)
457
-
458
- # F1, Precision, Recall
459
- word_metrics = calculate_f1_precision_recall(response, reference)
460
-
461
- # Format metrics text
462
- metrics_text = f"""
463
- ## Evaluation Metrics:
464
- - **ROUGE-1**: {rouge_scores['rouge1'].fmeasure:.4f}
465
- - **ROUGE-L**: {rouge_scores['rougeL'].fmeasure:.4f}
466
- - **BLEU-1**: {bleu_scores['bleu_1']:.4f}
467
- - **BLEU-4**: {bleu_scores['bleu_4']:.4f}
468
- - **METEOR**: {meteor:.4f}
469
- - **Word F1**: {word_metrics['f1']:.4f}
470
- - **Word Precision**: {word_metrics['precision']:.4f}
471
- - **Word Recall**: {word_metrics['recall']:.4f}
472
- """
473
-
474
- return response, source_text, context_text, metrics_text, language
475
 
476
- def evaluate_sample(sample_index):
477
- """Evaluate a sample from the predefined evaluation dataset"""
478
- if sample_index < 0 or sample_index >= len(sample_evaluation_data):
479
- return "Invalid sample index", "", "", "", ""
480
 
481
- sample = sample_evaluation_data[sample_index]
 
482
  query = sample["query"]
483
  reference = sample["reference"]
 
 
484
 
485
- # Process the query with the reference for evaluation
486
- response, source_text, context_text, metrics_text, language = process_query(query, reference)
 
487
 
488
- # Add reference to the output
489
- reference_text = f"""
490
- ## Reference Answer:
491
- {reference}
492
- """
493
 
494
- return response, source_text, context_text, metrics_text + reference_text, language
495
-
496
- def reset_chat():
497
- """Reset the conversation history"""
498
- if ASSISTANT:
499
- ASSISTANT.reset_conversation()
500
- return "Conversation has been reset."
501
- return "System not initialized."
502
 
503
- def qualitative_feedback(response, user_feedback, feedback_type):
504
- """Save qualitative feedback from users"""
505
- try:
506
- feedback_data = {
507
- "response": response,
508
- "user_feedback": user_feedback,
509
- "feedback_type": feedback_type,
510
- "timestamp": str(datetime.datetime.now())
511
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
 
513
- # Ensure directory exists
514
- os.makedirs("feedback", exist_ok=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
516
- # Append to feedback file
517
- with open("feedback/user_feedback.jsonl", "a") as f:
518
- f.write(json.dumps(feedback_data) + "\n")
 
 
 
 
 
 
 
 
 
 
 
 
519
 
520
- return f"Thank you for your {feedback_type} feedback!"
521
- except Exception as e:
522
- return f"Error saving feedback: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
- # Create Gradio interface
525
- with gr.Blocks(title="Vision 2030 Assistant - Qualitative Evaluation") as demo:
526
- gr.Markdown("# Vision 2030 Virtual Assistant - Qualitative Evaluation")
527
- gr.Markdown("This interface allows you to interact with and evaluate the multilingual Vision 2030 Assistant.")
528
-
529
- with gr.Tab("System Initialization"):
530
- init_button = gr.Button("Initialize System")
531
- init_output = gr.Textbox(label="Initialization Status")
532
-
533
- init_button.click(initialize_system, inputs=[], outputs=[init_output])
534
-
535
- with gr.Tab("Chat & Evaluation"):
536
- with gr.Row():
537
- with gr.Column(scale=2):
538
- query_input = gr.Textbox(label="Ask about Saudi Vision 2030 (in English or Arabic)", lines=3)
539
- reference_input = gr.Textbox(label="Reference Answer (Optional - for evaluation)", lines=3)
540
-
541
- with gr.Row():
542
- submit_btn = gr.Button("Submit")
543
- reset_btn = gr.Button("Reset Chat")
544
-
545
- response_output = gr.Textbox(label="Response", lines=6)
546
-
547
- with gr.Accordion("Evaluation Metrics", open=False):
548
- metrics_output = gr.Markdown()
549
-
550
- with gr.Accordion("Retrieved Sources", open=False):
551
- sources_output = gr.Textbox(label="Sources")
552
-
553
- with gr.Accordion("Retrieved Contexts", open=False):
554
- contexts_output = gr.Textbox(label="Contexts", lines=10)
555
-
556
- with gr.Accordion("Qualitative Feedback", open=False):
557
- feedback_text = gr.Textbox(label="Your Feedback", lines=3)
558
- feedback_type = gr.Radio(
559
- ["Correctness", "Relevance", "Fluency", "Completeness", "Other"],
560
- label="Feedback Type"
561
- )
562
- feedback_btn = gr.Button("Submit Feedback")
563
- feedback_output = gr.Textbox(label="Feedback Status")
564
-
565
- with gr.Tab("Sample Evaluation"):
566
- sample_index = gr.Slider(0, len(sample_evaluation_data)-1, 0, step=1, label="Sample Index")
567
- eval_btn = gr.Button("Evaluate Sample")
568
-
569
- sample_response = gr.Textbox(label="Response", lines=6)
570
- sample_metrics = gr.Markdown(label="Metrics & Reference")
571
-
572
- with gr.Accordion("Retrieved Sources", open=False):
573
- sample_sources = gr.Textbox(label="Sources")
574
-
575
- with gr.Accordion("Retrieved Contexts", open=False):
576
- sample_contexts = gr.Textbox(label="Contexts", lines=10)
577
-
578
- with gr.Tab("About"):
579
- gr.Markdown("""
580
- ## Vision 2030 Assistant
581
-
582
- This is a multilingual RAG-based Conversational Agent using ALLaM-7B for answering questions about Saudi Vision 2030.
583
-
584
- ### Features:
585
- - Supports both Arabic and English queries
586
- - Uses Retrieval-Augmented Generation (RAG) for accurate answers
587
- - Provides transparent sources for information
588
- - Comprehensive evaluation metrics
589
-
590
- ### How to use:
591
- 1. Initialize the system (first tab)
592
- 2. Ask questions about Saudi Vision 2030 in the Chat tab
593
- 3. Optionally provide reference answers for evaluation
594
- 4. Explore sample evaluations from our test dataset
595
-
596
- ### Evaluation Metrics:
597
- - ROUGE: Measures overlap of n-grams between response and reference
598
- - BLEU: Measures precision of n-grams in the response compared to reference
599
- - METEOR: Measures semantic similarity between response and reference
600
- - F1/Precision/Recall: Word-level comparison metrics
601
- """)
602
-
603
- # Set up event handlers
604
- submit_btn.click(
605
- process_query,
606
- inputs=[query_input, reference_input],
607
- outputs=[response_output, sources_output, contexts_output, metrics_output]
608
- )
609
-
610
- reset_btn.click(
611
- reset_chat,
612
- inputs=[],
613
- outputs=[response_output]
614
- )
615
-
616
- eval_btn.click(
617
- evaluate_sample,
618
- inputs=[sample_index],
619
- outputs=[sample_response, sample_sources, sample_contexts, sample_metrics]
620
- )
621
-
622
- feedback_btn.click(
623
- qualitative_feedback,
624
- inputs=[response_output, feedback_text, feedback_type],
625
- outputs=[feedback_output]
626
- )
627
 
628
- # Launch the interface
629
  if __name__ == "__main__":
630
- demo.launch()
 
1
  import os
2
  import re
3
+ import json
4
  import torch
 
5
  import numpy as np
6
+ import pandas as pd
7
  from tqdm import tqdm
8
+ from pathlib import Path
9
 
10
  # PDF processing
11
  import PyPDF2
 
26
 
27
  # Evaluation
28
  from rouge_score import rouge_scorer
29
+ import sacrebleu
30
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
31
+ import matplotlib.pyplot as plt
32
+ import seaborn as sns
33
+ from collections import defaultdict
34
 
35
+ # Gradio for the interface
36
+ import gradio as gr
 
 
 
 
 
37
 
38
+ # Helper functions
39
  def safe_tokenize(text):
40
  """Pure regex tokenizer with no NLTK dependency"""
41
  if not text:
 
45
  # Split on whitespace and filter empty strings
46
  return [token for token in re.split(r'\s+', text.lower()) if token]
47
 
48
def detect_language(text):
    """Classify *text* as primarily Arabic or English.

    The decision is a simple heuristic: the share of letters that fall in the
    basic Arabic Unicode block (U+0600-U+06FF). Only alphabetic characters are
    counted in the denominator, so digits, whitespace and punctuation (for
    example "2030" or "65%") do not dilute the ratio.

    Args:
        text: The string to classify. ``None`` and empty strings are accepted.

    Returns:
        ``"arabic"`` when more than half of the letters are Arabic, otherwise
        ``"english"`` (also the result for empty or letter-free input).
    """
    if not text:
        return "english"

    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        return "english"

    arabic_count = sum(1 for ch in letters if "\u0600" <= ch <= "\u06FF")
    return "arabic" if arabic_count > len(letters) * 0.5 else "english"
54
+
55
+ # Evaluation metrics
56
  def calculate_bleu(prediction, reference):
57
  """Calculate BLEU score without any NLTK dependency"""
58
  # Tokenize texts using our own tokenizer
 
124
 
125
  return {'precision': precision, 'recall': recall, 'f1': f1}
126
 
127
def evaluate_retrieval_quality(contexts, query, language):
    """Summarise the quality of the contexts retrieved for one query.

    Only ``source_diversity`` is computed from the input. The other two values
    are fixed placeholders until a real retrieval evaluation is implemented.

    Args:
        contexts: Iterable of dict-like context records; each is read with
            ``.get('source', '')``. ``None`` is treated as no contexts.
        query: The user query (currently unused by the placeholder logic).
        language: The query language (currently unused by the placeholder
            logic).

    Returns:
        A dict with the keys ``language_match_ratio``, ``source_diversity``
        and ``mrr``.
    """
    contexts = contexts or []
    distinct_sources = {ctx.get('source', '') for ctx in contexts}

    return {
        'language_match_ratio': 1.0,  # Placeholder
        # Fraction of retrieved contexts that come from distinct sources;
        # max(1, ...) keeps the empty case at 0.0 instead of dividing by zero.
        'source_diversity': len(distinct_sources) / max(1, len(contexts)),
        'mrr': 1.0,  # Placeholder for Mean Reciprocal Rank
    }
136
+
137
+ # PDF Processing and Vector Store
138
+ def simple_process_pdfs(pdf_paths):
139
+ """Process PDF documents and return document objects"""
140
  documents = []
141
 
142
+ for pdf_path in pdf_paths:
143
  try:
144
  text = ""
145
  with open(pdf_path, 'rb') as file:
 
198
 
199
  return vector_store
200
 
201
+ # Model Loading and RAG System
202
  def load_model_and_tokenizer():
203
  """Load the ALLaM-7B model and tokenizer with error handling"""
204
  model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
 
318
  # Fallback response
319
  return "I apologize, but I encountered an error while generating a response."
320
 
321
+ # Assistant Class
322
  class Vision2030Assistant:
323
  def __init__(self, model, tokenizer, vector_store):
324
  self.model = model
 
363
  self.conversation_history = []
364
  return "Conversation has been reset."
365
 
366
+ # Comprehensive evaluation dataset
367
+ comprehensive_evaluation_data = [
368
+ # === Overview ===
369
  {
370
  "query": "ما هي رؤية السعودية 2030؟",
371
  "reference": "رؤية السعودية 2030 هي خطة استراتيجية تهدف إلى تنويع الاقتصاد السعودي وتقليل الاعتماد على النفط مع تطوير قطاعات مختلفة مثل الصحة والتعليم والسياحة.",
 
378
  "category": "overview",
379
  "language": "english"
380
  },
381
+
382
+ # === Economic Goals ===
383
  {
384
  "query": "ما هي الأهداف الاقتصادية لرؤية 2030؟",
385
  "reference": "تشمل الأهداف الاقتصادية زيادة مساهمة القطاع الخاص إلى 65%، وزيادة الصادرات غير النفطية إلى 50% من الناتج المحلي غير النفطي، وخفض البطالة إلى 7%.",
 
392
  "category": "economic",
393
  "language": "english"
394
  },
395
+
396
+ # === Social Goals ===
397
  {
398
+ "query": "كيف تعزز رؤية 2030 الإرث الثقافي السعودي؟",
399
+ "reference": "تتضمن رؤية 2030 الحفاظ على الهوية الوطنية، تسجيل مواقع أثرية في اليونسكو، وتعزيز الفعاليات الثقافية.",
400
+ "category": "social",
401
+ "language": "arabic"
402
+ },
403
+ {
404
+ "query": "How does Vision 2030 aim to improve quality of life?",
405
+ "reference": "Vision 2030 plans to enhance quality of life by expanding sports facilities, promoting cultural activities, and boosting tourism and entertainment sectors.",
406
+ "category": "social",
407
  "language": "english"
408
  }
409
  ]
410
 
411
+ # Gradio Interface
 
 
 
 
 
 
 
 
 
412
def initialize_system():
    """Initialize the Vision 2030 Assistant system.

    Creates the working directories, loads the saved FAISS vector store when
    one exists (rebuilding it from the PDFs if loading fails or nothing has
    been saved yet), loads the model and tokenizer, and wires them into a
    ``Vision2030Assistant``.

    Returns:
        The ready-to-use ``Vision2030Assistant`` instance.

    Raises:
        RuntimeError: If the vector store has to be built but no PDF document
            could be processed.
    """
    # Define paths
    model_dir = "models"
    vector_store_dir = "vector_stores"
    pdf_dir = "pdf_data"

    for directory in (model_dir, vector_store_dir, pdf_dir):
        os.makedirs(directory, exist_ok=True)

    # For a Hugging Face Space these files must be uploaded with the app;
    # nothing here downloads them.
    # NOTE(review): "saudi_vision203.pdf" looks like a typo for
    # "saudi_vision2030.pdf" - confirm against the uploaded file name.
    pdf_files = ["vision2030_docs/saudi_vision203.pdf", "vision2030_docs/saudi_vision2030_ar.pdf"]

    # Prefer the saved vector store, but never let a stale or unreadable index
    # abort start-up: fall back to rebuilding it from the PDFs.
    vector_store = None
    if os.path.exists(os.path.join(vector_store_dir, "index.faiss")):
        print("Loading existing vector store...")
        try:
            embedding_function = HuggingFaceEmbeddings(
                model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
            )
            # NOTE(review): newer LangChain releases may require
            # allow_dangerous_deserialization=True here - confirm against the
            # pinned version. Until then a refusal simply triggers a rebuild.
            vector_store = FAISS.load_local(vector_store_dir, embedding_function)
        except Exception as exc:
            print(f"Error loading vector store: {exc}")
            vector_store = None

    if vector_store is None:
        print("Creating new vector store...")
        documents = simple_process_pdfs(pdf_files)
        if not documents:
            # Fail with a clear message instead of an obscure error from the
            # vector-store builder.
            raise RuntimeError("No documents were processed. Cannot continue.")
        vector_store = create_vector_store(documents)
        vector_store.save_local(vector_store_dir)

    # Load model and tokenizer
    model, tokenizer = load_model_and_tokenizer()

    # Initialize assistant
    assistant = Vision2030Assistant(model, tokenizer, vector_store)

    return assistant
453
+
454
def evaluate_response(query, response, reference):
    """Evaluate a single response against a reference answer.

    Args:
        query: The question that produced ``response`` (kept for interface
            compatibility; not used by any metric).
        response: The assistant's generated answer.
        reference: The gold/reference answer to compare against.

    Returns:
        A dict mapping metric names (ROUGE-1/2/L F-measure, BLEU-1, BLEU-4,
        METEOR, word-level precision/recall/F1) to values formatted as
        strings with four decimal places.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # RougeScorer.score takes (target, prediction). Only the symmetric
    # F-measure is reported below, but the correct order keeps precision and
    # recall meaningful for anyone who reads them later.
    # NOTE(review): rouge_score's default tokenizer appears to keep only
    # ASCII alphanumerics, which would make ROUGE 0 for Arabic text - confirm
    # and pass a custom tokenizer if so.
    rouge_scores = rouge.score(reference, response)

    bleu_scores = calculate_bleu(response, reference)
    meteor = calculate_meteor(response, reference)
    word_metrics = calculate_f1_precision_recall(response, reference)

    # Format results
    evaluation_results = {
        "ROUGE-1": f"{rouge_scores['rouge1'].fmeasure:.4f}",
        "ROUGE-2": f"{rouge_scores['rouge2'].fmeasure:.4f}",
        "ROUGE-L": f"{rouge_scores['rougeL'].fmeasure:.4f}",
        "BLEU-1": f"{bleu_scores['bleu_1']:.4f}",
        "BLEU-4": f"{bleu_scores['bleu_4']:.4f}",
        "METEOR": f"{meteor:.4f}",
        "Word Precision": f"{word_metrics['precision']:.4f}",
        "Word Recall": f"{word_metrics['recall']:.4f}",
        "Word F1": f"{word_metrics['f1']:.4f}",
    }

    return evaluation_results
478
 
479
def run_conversation(assistant, query):
    """Send *query* to the assistant and hand back its answer.

    Args:
        assistant: Object exposing ``answer(query)`` that yields exactly three
            values.
        query: The user's question.

    Returns:
        A ``(response, sources, contexts)`` tuple, exactly as produced by
        ``assistant.answer``.
    """
    reply, cited_sources, retrieved_contexts = assistant.answer(query)
    return (reply, cited_sources, retrieved_contexts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
def run_evaluation_on_sample(assistant, sample_index=0):
    """Run evaluation on a selected sample from the evaluation dataset.

    Args:
        assistant: Assistant exposing ``reset_conversation()`` and
            ``answer(query)``.
        sample_index: Zero-based index into ``comprehensive_evaluation_data``.

    Returns:
        A 7-tuple ``(query, response, reference, evaluation_results, sources,
        category, language)``. For an out-of-range index the tuple has the
        same shape, with ``"Invalid sample index"`` in the first slot and
        empty values elsewhere, so callers can always unpack seven items.
    """
    if sample_index < 0 or sample_index >= len(comprehensive_evaluation_data):
        return "Invalid sample index", "", "", {}, [], "", ""

    # Get the sample
    sample = comprehensive_evaluation_data[sample_index]
    query = sample["query"]
    reference = sample["reference"]
    category = sample["category"]
    language = sample["language"]

    # Start from a clean history so earlier turns cannot influence the answer.
    assistant.reset_conversation()
    response, sources, contexts = assistant.answer(query)

    # Evaluate response
    evaluation_results = evaluate_response(query, response, reference)

    return query, response, reference, evaluation_results, sources, category, language
 
 
 
 
507
 
508
def qualitative_evaluation_interface(assistant):
    """Create a Gradio interface for qualitative evaluation.

    The interface has three tabs: evaluation of a predefined sample,
    evaluation of a custom query (with optional reference answer), and a free
    conversation mode.

    Args:
        assistant: Initialized assistant exposing ``answer(query)`` and
            ``reset_conversation()``.

    Returns:
        The ``gr.Blocks`` interface, not yet launched.
    """
    sample_options = [f"{i+1}. {item['query'][:50]}..." for i, item in enumerate(comprehensive_evaluation_data)]

    def format_sources(sources):
        """Join source identifiers for display, tolerating non-string items."""
        return ", ".join(str(source) for source in (sources or []))

    with gr.Blocks(title="Vision 2030 Assistant - Qualitative Evaluation") as interface:
        gr.Markdown("# Vision 2030 Assistant - Qualitative Evaluation")
        gr.Markdown("This interface allows you to evaluate the Vision 2030 Assistant on predefined samples or your own queries.")

        with gr.Tab("Sample Evaluation"):
            gr.Markdown("### Evaluate the assistant on predefined samples")

            sample_dropdown = gr.Dropdown(
                choices=sample_options,
                label="Select a sample query",
                value=sample_options[0] if sample_options else None
            )

            eval_button = gr.Button("Evaluate Sample")

            with gr.Row():
                with gr.Column():
                    sample_query = gr.Textbox(label="Query")
                    sample_category = gr.Textbox(label="Category")
                    sample_language = gr.Textbox(label="Language")

                with gr.Column():
                    sample_response = gr.Textbox(label="Assistant Response")
                    sample_reference = gr.Textbox(label="Reference Answer")
                    sample_sources = gr.Textbox(label="Sources Used")

            with gr.Row():
                metrics_display = gr.JSON(label="Evaluation Metrics")

        with gr.Tab("Custom Evaluation"):
            gr.Markdown("### Evaluate the assistant on your own query")

            custom_query = gr.Textbox(
                lines=3,
                placeholder="Enter your question about Saudi Vision 2030...",
                label="Your Query"
            )

            custom_reference = gr.Textbox(
                lines=3,
                placeholder="Enter a reference answer (optional)...",
                label="Reference Answer (Optional)"
            )

            custom_eval_button = gr.Button("Get Response and Evaluate")

            custom_response = gr.Textbox(label="Assistant Response")
            custom_sources = gr.Textbox(label="Sources Used")

            custom_metrics = gr.JSON(
                label="Evaluation Metrics (if reference provided)",
                visible=True
            )

        with gr.Tab("Conversation Mode"):
            gr.Markdown("### Have a conversation with the Vision 2030 Assistant")

            chatbot = gr.Chatbot(label="Conversation")

            conv_input = gr.Textbox(
                placeholder="Ask about Saudi Vision 2030...",
                label="Your message"
            )

            with gr.Row():
                conv_button = gr.Button("Send")
                reset_button = gr.Button("Reset Conversation")

            conv_sources = gr.Textbox(label="Sources Used")

        # Sample evaluation event handlers
        def handle_sample_selection(selection):
            # Output order: query, response, reference, metrics (JSON),
            # sources, category, language. The metrics slot always gets a dict.
            if not selection:
                return "", "", "", {}, "", "", ""

            try:
                # Options look like "3. <query>..."; the leading number is the
                # 1-based sample index.
                index = int(selection.split(".")[0]) - 1
                query, response, reference, metrics, sources, category, language = run_evaluation_on_sample(assistant, index)
                return query, response, reference, metrics, format_sources(sources), category, language
            except Exception as exc:
                # Log the cause rather than hiding it; the UI still gets a
                # friendly message.
                print(f"Error processing selection: {exc}")
                return "Error processing selection", "", "", {}, "", "", ""

        sample_outputs = [sample_query, sample_response, sample_reference, metrics_display,
                          sample_sources, sample_category, sample_language]

        eval_button.click(
            handle_sample_selection,
            inputs=[sample_dropdown],
            outputs=sample_outputs
        )

        sample_dropdown.change(
            handle_sample_selection,
            inputs=[sample_dropdown],
            outputs=sample_outputs
        )

        # Custom evaluation event handlers
        def handle_custom_evaluation(query, reference):
            if not query:
                return "Please enter a query", "", {}

            # Reset conversation to ensure clean state
            assistant.reset_conversation()

            # Get response
            response, sources, _ = assistant.answer(query)

            # Metrics are only meaningful when a reference answer was supplied.
            metrics = evaluate_response(query, response, reference) if reference else {}

            return response, format_sources(sources), metrics

        custom_eval_button.click(
            handle_custom_evaluation,
            inputs=[custom_query, custom_reference],
            outputs=[custom_response, custom_sources, custom_metrics]
        )

        # Conversation mode event handlers
        def handle_conversation(message, history):
            # The chatbot value may be None before the first turn.
            history = history or []
            if not message:
                return history, "", ""

            # Get response
            response, sources, _ = assistant.answer(message)

            # Update history and clear the input box for the next message.
            history = history + [[message, response]]

            return history, "", format_sources(sources)

        def reset_conv():
            assistant.reset_conversation()
            # Clear the chat, the input box and the sources. The status text is
            # deliberately not written into the input box, where it would be
            # sent as the next user message.
            return [], "", ""

        conv_button.click(
            handle_conversation,
            inputs=[conv_input, chatbot],
            outputs=[chatbot, conv_input, conv_sources]
        )

        reset_button.click(
            reset_conv,
            inputs=[],
            outputs=[chatbot, conv_input, conv_sources]
        )

    return interface
667
 
668
+ # Main function to run in Hugging Face Space
669
def main():
    """Build and launch the evaluation app (entry point for the Space).

    If system initialization or interface construction fails, a minimal
    interface that reports the failure is launched instead, so the Space
    still shows why it is unavailable.
    """
    try:
        assistant = initialize_system()
        interface = qualitative_evaluation_interface(assistant)
    except Exception as e:
        print(f"Error initializing system: {e}")
        # Capture the message now: Python unbinds `e` when the except clause
        # ends, so a callback that closed over `e` itself could fail later.
        failure_message = f"System initialization failed: {e}"
        # Create a simple error interface
        interface = gr.Interface(
            fn=lambda x: failure_message,
            inputs=gr.Textbox(placeholder="System failed to initialize"),
            outputs=gr.Textbox()
        )

    # Launch outside the try block so a launch failure is not misreported as
    # an initialization failure.
    interface.launch()


if __name__ == "__main__":
    main()