Spaces:

abdull4h
/

vision-2030-virtual-assistant

Running

App Files Files Community

abdull4h committed on Mar 21

Commit

ea4bdbe

verified ·

1 Parent(s): 8f83e1c

Update app.py

Browse files

Files changed (1) hide show

app.py +305 -250

app.py CHANGED Viewed

@@ -1,11 +1,11 @@
 import os
 import re
 import torch
-import gradio as gr
 import numpy as np
-from pathlib import Path
 from tqdm import tqdm
-import json
 # PDF processing
 import PyPDF2
@@ -26,15 +26,16 @@ from bidi.algorithm import get_display
 # Evaluation
 from rouge_score import rouge_scorer
-# Helper functions from your notebook
-def detect_language(text):
-    """Detect if text is primarily Arabic or English"""
-    # Simple heuristic: count Arabic characters
-    arabic_chars = re.findall(r'[\u0600-\u06FF]', text)
-    is_arabic = len(arabic_chars) > len(text) * 0.5
-    return "arabic" if is_arabic else "english"
 def safe_tokenize(text):
     """Pure regex tokenizer with no NLTK dependency"""
     if not text:
@@ -44,7 +45,14 @@ def safe_tokenize(text):
     # Split on whitespace and filter empty strings
     return [token for token in re.split(r'\s+', text.lower()) if token]
-# Evaluation metric functions
 def calculate_bleu(prediction, reference):
     """Calculate BLEU score without any NLTK dependency"""
     # Tokenize texts using our own tokenizer
@@ -116,12 +124,22 @@ def calculate_f1_precision_recall(prediction, reference):
     return {'precision': precision, 'recall': recall, 'f1': f1}
-# Load PDFs and create vector store
-def process_pdfs(pdf_files):
-    """Process uploaded PDF documents and return document objects"""
     documents = []
-    for pdf_path in pdf_files:
         try:
             text = ""
             with open(pdf_path, 'rb') as file:
@@ -180,6 +198,7 @@ def create_vector_store(documents):
     return vector_store
 def load_model_and_tokenizer():
     """Load the ALLaM-7B model and tokenizer with error handling"""
     model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
@@ -299,7 +318,7 @@ Question: {query} [/INST]</s>"""
         # Fallback response
         return "I apologize, but I encountered an error while generating a response."
-# Assistant class
 class Vision2030Assistant:
     def __init__(self, model, tokenizer, vector_store):
         self.model = model
@@ -344,8 +363,9 @@ class Vision2030Assistant:
         self.conversation_history = []
         return "Conversation has been reset."
-# Sample evaluation data (subset)
-sample_evaluation_data = [
     {
         "query": "ما هي رؤية السعودية 2030؟",
         "reference": "رؤية السعودية 2030 هي خطة استراتيجية تهدف إلى تنويع الاقتصاد السعودي وتقليل الاعتماد على النفط مع تطوير قطاعات مختلفة مثل الصحة والتعليم والسياحة.",
@@ -358,6 +378,8 @@ sample_evaluation_data = [
         "category": "overview",
         "language": "english"
     },
     {
         "query": "ما هي الأهداف الاقتصادية لرؤية 2030؟",
         "reference": "تشمل الأهداف الاقتصادية زيادة مساهمة القطاع الخاص إلى 65%، وزيادة الصادرات غير النفطية إلى 50% من الناتج المحلي غير النفطي، وخفض البطالة إلى 7%.",
@@ -370,261 +392,294 @@ sample_evaluation_data = [
         "category": "economic",
         "language": "english"
     },
     {
-        "query": "How does Vision 2030 support small and medium enterprises (SMEs)?",
-        "reference": "Vision 2030 supports SMEs by increasing their GDP contribution, facilitating access to funding, and reducing regulatory obstacles.",
-        "category": "economic",
         "language": "english"
     }
 ]
-# Global variables for storing state
-ASSISTANT = None
-MODEL = None
-TOKENIZER = None
-VECTOR_STORE = None
-PDF_PATHS = ["vision2030_docs/saudi_vision203.pdf", "vision2030_docs/saudi_vision2030_ar.pdf"]
-# Initialize evaluation
-rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
 def initialize_system():
-    global MODEL, TOKENIZER, VECTOR_STORE, ASSISTANT
-    # Try to load from saved files first
-    if os.path.exists("data/vision2030_vector_store"):
-        print("Loading vector store from saved file...")
-        try:
-            embedding_function = HuggingFaceEmbeddings(
-                model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
-            )
-            VECTOR_STORE = FAISS.load_local("data/vision2030_vector_store", embedding_function)
-            print("Vector store loaded successfully!")
-        except Exception as e:
-            print(f"Error loading vector store: {e}")
-            VECTOR_STORE = None
-    # If vector store not loaded, process PDFs and create it
-    if VECTOR_STORE is None:
-        print("Processing PDF documents...")
-        vision2030_docs = process_pdfs(PDF_PATHS)
-        if not vision2030_docs:
-            return "Error: No documents were processed. Cannot continue."
-        print("Creating vector store...")
-        VECTOR_STORE = create_vector_store(vision2030_docs)
-        # Save the vector store for future use
-        os.makedirs("data", exist_ok=True)
-        VECTOR_STORE.save_local("data/vision2030_vector_store")
-        print("Vector store saved to data/vision2030_vector_store")
     # Load model and tokenizer
-    print("Loading ALLaM-7B model...")
-    MODEL, TOKENIZER = load_model_and_tokenizer()
     # Initialize assistant
-    ASSISTANT = Vision2030Assistant(MODEL, TOKENIZER, VECTOR_STORE)
-    print("Vision 2030 Assistant initialized successfully!")
-    return "System initialized and ready!"
-def process_query(query, reference=None):
-    """Process a user query and return the response with evaluation if reference is provided"""
-    if ASSISTANT is None:
-        return "System not initialized. Please initialize first.", "", "", "", ""
-    # Process query
-    response, sources, contexts = ASSISTANT.answer(query)
-    # Additional details
-    language = detect_language(query)
-    source_text = "\n".join([f"Source: {s}" for s in sources])
-    context_text = "\n\n".join([f"Context {i+1}: {ctx['content'][:200]}..." for i, ctx in enumerate(contexts)])
-    # Calculate metrics if reference is provided
-    metrics_text = ""
-    if reference:
-        # ROUGE scores
-        rouge_scores = rouge_scorer_instance.score(response, reference)
-        # BLEU scores
-        bleu_scores = calculate_bleu(response, reference)
-        # METEOR score
-        meteor = calculate_meteor(response, reference)
-        # F1, Precision, Recall
-        word_metrics = calculate_f1_precision_recall(response, reference)
-        # Format metrics text
-        metrics_text = f"""
-## Evaluation Metrics:
-- **ROUGE-1**: {rouge_scores['rouge1'].fmeasure:.4f}
-- **ROUGE-L**: {rouge_scores['rougeL'].fmeasure:.4f}
-- **BLEU-1**: {bleu_scores['bleu_1']:.4f}
-- **BLEU-4**: {bleu_scores['bleu_4']:.4f}
-- **METEOR**: {meteor:.4f}
-- **Word F1**: {word_metrics['f1']:.4f}
-- **Word Precision**: {word_metrics['precision']:.4f}
-- **Word Recall**: {word_metrics['recall']:.4f}
-        """
-    return response, source_text, context_text, metrics_text, language
-def evaluate_sample(sample_index):
-    """Evaluate a sample from the predefined evaluation dataset"""
-    if sample_index < 0 or sample_index >= len(sample_evaluation_data):
-        return "Invalid sample index", "", "", "", ""
-    sample = sample_evaluation_data[sample_index]
     query = sample["query"]
     reference = sample["reference"]
-    # Process the query with the reference for evaluation
-    response, source_text, context_text, metrics_text, language = process_query(query, reference)
-    # Add reference to the output
-    reference_text = f"""
-## Reference Answer:
-{reference}
-    """
-    return response, source_text, context_text, metrics_text + reference_text, language
-def reset_chat():
-    """Reset the conversation history"""
-    if ASSISTANT:
-        ASSISTANT.reset_conversation()
-        return "Conversation has been reset."
-    return "System not initialized."
-def qualitative_feedback(response, user_feedback, feedback_type):
-    """Save qualitative feedback from users"""
-    try:
-        feedback_data = {
-            "response": response,
-            "user_feedback": user_feedback,
-            "feedback_type": feedback_type,
-            "timestamp": str(datetime.datetime.now())
-        }
-        # Ensure directory exists
-        os.makedirs("feedback", exist_ok=True)
-        # Append to feedback file
-        with open("feedback/user_feedback.jsonl", "a") as f:
-            f.write(json.dumps(feedback_data) + "\n")
-        return f"Thank you for your {feedback_type} feedback!"
-    except Exception as e:
-        return f"Error saving feedback: {e}"
-# Create Gradio interface
-with gr.Blocks(title="Vision 2030 Assistant - Qualitative Evaluation") as demo:
-    gr.Markdown("# Vision 2030 Virtual Assistant - Qualitative Evaluation")
-    gr.Markdown("This interface allows you to interact with and evaluate the multilingual Vision 2030 Assistant.")
-    with gr.Tab("System Initialization"):
-        init_button = gr.Button("Initialize System")
-        init_output = gr.Textbox(label="Initialization Status")
-        init_button.click(initialize_system, inputs=[], outputs=[init_output])
-    with gr.Tab("Chat & Evaluation"):
-        with gr.Row():
-            with gr.Column(scale=2):
-                query_input = gr.Textbox(label="Ask about Saudi Vision 2030 (in English or Arabic)", lines=3)
-                reference_input = gr.Textbox(label="Reference Answer (Optional - for evaluation)", lines=3)
-                with gr.Row():
-                    submit_btn = gr.Button("Submit")
-                    reset_btn = gr.Button("Reset Chat")
-                response_output = gr.Textbox(label="Response", lines=6)
-                with gr.Accordion("Evaluation Metrics", open=False):
-                    metrics_output = gr.Markdown()
-                with gr.Accordion("Retrieved Sources", open=False):
-                    sources_output = gr.Textbox(label="Sources")
-                with gr.Accordion("Retrieved Contexts", open=False):
-                    contexts_output = gr.Textbox(label="Contexts", lines=10)
-                with gr.Accordion("Qualitative Feedback", open=False):
-                    feedback_text = gr.Textbox(label="Your Feedback", lines=3)
-                    feedback_type = gr.Radio(
-                        ["Correctness", "Relevance", "Fluency", "Completeness", "Other"],
-                        label="Feedback Type"
-                    )
-                    feedback_btn = gr.Button("Submit Feedback")
-                    feedback_output = gr.Textbox(label="Feedback Status")
-    with gr.Tab("Sample Evaluation"):
-        sample_index = gr.Slider(0, len(sample_evaluation_data)-1, 0, step=1, label="Sample Index")
-        eval_btn = gr.Button("Evaluate Sample")
-        sample_response = gr.Textbox(label="Response", lines=6)
-        sample_metrics = gr.Markdown(label="Metrics & Reference")
-        with gr.Accordion("Retrieved Sources", open=False):
-            sample_sources = gr.Textbox(label="Sources")
-        with gr.Accordion("Retrieved Contexts", open=False):
-            sample_contexts = gr.Textbox(label="Contexts", lines=10)
-    with gr.Tab("About"):
-        gr.Markdown("""
-        ## Vision 2030 Assistant
-        This is a multilingual RAG-based Conversational Agent using ALLaM-7B for answering questions about Saudi Vision 2030.
-        ### Features:
-        - Supports both Arabic and English queries
-        - Uses Retrieval-Augmented Generation (RAG) for accurate answers
-        - Provides transparent sources for information
-        - Comprehensive evaluation metrics
-        ### How to use:
-        1. Initialize the system (first tab)
-        2. Ask questions about Saudi Vision 2030 in the Chat tab
-        3. Optionally provide reference answers for evaluation
-        4. Explore sample evaluations from our test dataset
-        ### Evaluation Metrics:
-        - ROUGE: Measures overlap of n-grams between response and reference
-        - BLEU: Measures precision of n-grams in the response compared to reference
-        - METEOR: Measures semantic similarity between response and reference
-        - F1/Precision/Recall: Word-level comparison metrics
-        """)
-    # Set up event handlers
-    submit_btn.click(
-        process_query,
-        inputs=[query_input, reference_input],
-        outputs=[response_output, sources_output, contexts_output, metrics_output]
-    )
-    reset_btn.click(
-        reset_chat,
-        inputs=[],
-        outputs=[response_output]
-    )
-    eval_btn.click(
-        evaluate_sample,
-        inputs=[sample_index],
-        outputs=[sample_response, sample_sources, sample_contexts, sample_metrics]
-    )
-    feedback_btn.click(
-        qualitative_feedback,
-        inputs=[response_output, feedback_text, feedback_type],
-        outputs=[feedback_output]
-    )
-# Launch the interface
 if __name__ == "__main__":
-    demo.launch()

 import os
 import re
+import json
 import torch
 import numpy as np
+import pandas as pd
 from tqdm import tqdm
+from pathlib import Path
 # PDF processing
 import PyPDF2
 # Evaluation
 from rouge_score import rouge_scorer
+import sacrebleu
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+import matplotlib.pyplot as plt
+import seaborn as sns
+from collections import defaultdict
+# Gradio for the interface
+import gradio as gr
+# Helper functions
 def safe_tokenize(text):
     """Pure regex tokenizer with no NLTK dependency"""
     if not text:
     # Split on whitespace and filter empty strings
     return [token for token in re.split(r'\s+', text.lower()) if token]
+def detect_language(text):
+    """Detect if text is primarily Arabic or English"""
+    # Simple heuristic: count Arabic characters
+    arabic_chars = re.findall(r'[\u0600-\u06FF]', text)
+    is_arabic = len(arabic_chars) > len(text) * 0.5
+    return "arabic" if is_arabic else "english"
+# Evaluation metrics
 def calculate_bleu(prediction, reference):
     """Calculate BLEU score without any NLTK dependency"""
     # Tokenize texts using our own tokenizer
     return {'precision': precision, 'recall': recall, 'f1': f1}
+def evaluate_retrieval_quality(contexts, query, language):
+    """Evaluate the quality of retrieved contexts"""
+    # This is a placeholder function that should be implemented based on
+    # how you want to evaluate retrieval quality
+    return {
+        'language_match_ratio': 1.0,  # Placeholder
+        'source_diversity': len(set([ctx.get('source', '') for ctx in contexts])) / max(1, len(contexts)),
+        'mrr': 1.0  # Placeholder for Mean Reciprocal Rank
+    }
+# PDF Processing and Vector Store
+def simple_process_pdfs(pdf_paths):
+    """Process PDF documents and return document objects"""
     documents = []
+    for pdf_path in pdf_paths:
         try:
             text = ""
             with open(pdf_path, 'rb') as file:
     return vector_store
+# Model Loading and RAG System
 def load_model_and_tokenizer():
     """Load the ALLaM-7B model and tokenizer with error handling"""
     model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
         # Fallback response
         return "I apologize, but I encountered an error while generating a response."
+# Assistant Class
 class Vision2030Assistant:
     def __init__(self, model, tokenizer, vector_store):
         self.model = model
         self.conversation_history = []
         return "Conversation has been reset."
+# Comprehensive evaluation dataset
+comprehensive_evaluation_data = [
+    # === Overview ===
     {
         "query": "ما هي رؤية السعودية 2030؟",
         "reference": "رؤية السعودية 2030 هي خطة استراتيجية تهدف إلى تنويع الاقتصاد السعودي وتقليل الاعتماد على النفط مع تطوير قطاعات مختلفة مثل الصحة والتعليم والسياحة.",
         "category": "overview",
         "language": "english"
     },
+    # === Economic Goals ===
     {
         "query": "ما هي الأهداف الاقتصادية لرؤية 2030؟",
         "reference": "تشمل الأهداف الاقتصادية زيادة مساهمة القطاع الخاص إلى 65%، وزيادة الصادرات غير النفطية إلى 50% من الناتج المحلي غير النفطي، وخفض البطالة إلى 7%.",
         "category": "economic",
         "language": "english"
     },
+    # === Social Goals ===
     {
+        "query": "كيف تعزز رؤية 2030 الإرث الثقافي السعودي؟",
+        "reference": "تتضمن رؤية 2030 الحفاظ على الهوية الوطنية، تسجيل مواقع أثرية في اليونسكو، وتعزيز الفعاليات الثقافية.",
+        "category": "social",
+        "language": "arabic"
+    },
+    {
+        "query": "How does Vision 2030 aim to improve quality of life?",
+        "reference": "Vision 2030 plans to enhance quality of life by expanding sports facilities, promoting cultural activities, and boosting tourism and entertainment sectors.",
+        "category": "social",
         "language": "english"
     }
 ]
+# Gradio Interface
 def initialize_system():
+    """Initialize the Vision 2030 Assistant system"""
+    # This would normally process PDFs and load models
+    # For Hugging Face Space, we'll need to check if models are already downloaded
+    # and if vector stores are already created
+    # Define paths
+    model_dir = "models"
+    vector_store_dir = "vector_stores"
+    pdf_dir = "pdf_data"
+    os.makedirs(model_dir, exist_ok=True)
+    os.makedirs(vector_store_dir, exist_ok=True)
+    os.makedirs(pdf_dir, exist_ok=True)
+    # Check if we need to download PDFs
+    pdf_files = ["vision2030_docs/saudi_vision203.pdf", "vision2030_docs/saudi_vision2030_ar.pdf"]
+    # This is where you would normally download the PDFs if they don't exist
+    # For Hugging Face Space, you would need to upload these files
+    # Process PDFs and create vector store
+    if os.path.exists(os.path.join(vector_store_dir, "index.faiss")):
+        print("Loading existing vector store...")
+        embedding_function = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
+        )
+        vector_store = FAISS.load_local(vector_store_dir, embedding_function)
+    else:
+        print("Creating new vector store...")
+        documents = simple_process_pdfs(pdf_files)
+        vector_store = create_vector_store(documents)
+        vector_store.save_local(vector_store_dir)
     # Load model and tokenizer
+    model, tokenizer = load_model_and_tokenizer()
     # Initialize assistant
+    assistant = Vision2030Assistant(model, tokenizer, vector_store)
+    return assistant
+def evaluate_response(query, response, reference):
+    """Score a single response against a reference answer.
+
+    Args:
+        query: The question that produced ``response``. Not used in the
+            computation; kept so existing callers keep working.
+        response: The assistant's generated answer (the prediction).
+        reference: The expected answer (the target).
+
+    Returns:
+        A dict mapping metric names (ROUGE-1/2/L, BLEU-1/4, METEOR and
+        word-level precision/recall/F1) to strings formatted with four
+        decimal places.
+    """
+    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
+    # RougeScorer.score expects (target, prediction). Only the symmetric
+    # F-measure is reported below, but passing the arguments in the documented
+    # order keeps precision/recall correct if they are ever surfaced.
+    rouge_scores = rouge.score(reference, response)
+    bleu_scores = calculate_bleu(response, reference)
+    meteor = calculate_meteor(response, reference)
+    word_metrics = calculate_f1_precision_recall(response, reference)
+    # Values are pre-formatted as strings for direct display in the UI.
+    evaluation_results = {
+        "ROUGE-1": f"{rouge_scores['rouge1'].fmeasure:.4f}",
+        "ROUGE-2": f"{rouge_scores['rouge2'].fmeasure:.4f}",
+        "ROUGE-L": f"{rouge_scores['rougeL'].fmeasure:.4f}",
+        "BLEU-1": f"{bleu_scores['bleu_1']:.4f}",
+        "BLEU-4": f"{bleu_scores['bleu_4']:.4f}",
+        "METEOR": f"{meteor:.4f}",
+        "Word Precision": f"{word_metrics['precision']:.4f}",
+        "Word Recall": f"{word_metrics['recall']:.4f}",
+        "Word F1": f"{word_metrics['f1']:.4f}"
+    }
+    return evaluation_results
+def run_conversation(assistant, query):
+    """Run a query through the assistant and return the response"""
+    response, sources, contexts = assistant.answer(query)
+    return response, sources, contexts
+def run_evaluation_on_sample(assistant, sample_index=0):
+    """Evaluate the assistant on one sample of the evaluation dataset.
+
+    Args:
+        assistant: Object exposing ``reset_conversation()`` and
+            ``answer(query)``; ``answer`` must return
+            ``(response, sources, contexts)``.
+        sample_index: Zero-based index into ``comprehensive_evaluation_data``.
+
+    Returns:
+        A 7-tuple ``(query, response, reference, evaluation_results, sources,
+        category, language)``. For an out-of-range index the first element is
+        the message ``"Invalid sample index"`` and the remaining elements are
+        empty placeholders of the matching types.
+    """
+    if sample_index < 0 or sample_index >= len(comprehensive_evaluation_data):
+        # Same arity as the success path: callers unpack seven values.
+        return "Invalid sample index", "", "", {}, [], "", ""
+    # Get the sample
+    sample = comprehensive_evaluation_data[sample_index]
     query = sample["query"]
     reference = sample["reference"]
+    category = sample["category"]
+    language = sample["language"]
+    # Start from an empty history so earlier turns cannot influence the answer.
+    assistant.reset_conversation()
+    response, sources, _contexts = assistant.answer(query)
+    # Evaluate response
+    evaluation_results = evaluate_response(query, response, reference)
+    return query, response, reference, evaluation_results, sources, category, language
+def qualitative_evaluation_interface(assistant):
+    """Create a Gradio interface for qualitative evaluation"""
+    sample_options = [f"{i+1}. {item['query'][:50]}..." for i, item in enumerate(comprehensive_evaluation_data)]
+    with gr.Blocks(title="Vision 2030 Assistant - Qualitative Evaluation") as interface:
+        gr.Markdown("# Vision 2030 Assistant - Qualitative Evaluation")
+        gr.Markdown("This interface allows you to evaluate the Vision 2030 Assistant on predefined samples or your own queries.")
+        with gr.Tab("Sample Evaluation"):
+            gr.Markdown("### Evaluate the assistant on predefined samples")
+            sample_dropdown = gr.Dropdown(
+                choices=sample_options,
+                label="Select a sample query",
+                value=sample_options[0] if sample_options else None
+            )
+            eval_button = gr.Button("Evaluate Sample")
+            with gr.Row():
+                with gr.Column():
+                    sample_query = gr.Textbox(label="Query")
+                    sample_category = gr.Textbox(label="Category")
+                    sample_language = gr.Textbox(label="Language")
+                with gr.Column():
+                    sample_response = gr.Textbox(label="Assistant Response")
+                    sample_reference = gr.Textbox(label="Reference Answer")
+                    sample_sources = gr.Textbox(label="Sources Used")
+            with gr.Row():
+                metrics_display = gr.JSON(label="Evaluation Metrics")
+        with gr.Tab("Custom Evaluation"):
+            gr.Markdown("### Evaluate the assistant on your own query")
+            custom_query = gr.Textbox(
+                lines=3,
+                placeholder="Enter your question about Saudi Vision 2030...",
+                label="Your Query"
+            )
+            custom_reference = gr.Textbox(
+                lines=3,
+                placeholder="Enter a reference answer (optional)...",
+                label="Reference Answer (Optional)"
+            )
+            custom_eval_button = gr.Button("Get Response and Evaluate")
+            custom_response = gr.Textbox(label="Assistant Response")
+            custom_sources = gr.Textbox(label="Sources Used")
+            custom_metrics = gr.JSON(
+                label="Evaluation Metrics (if reference provided)",
+                visible=True
+            )
+        with gr.Tab("Conversation Mode"):
+            gr.Markdown("### Have a conversation with the Vision 2030 Assistant")
+            chatbot = gr.Chatbot(label="Conversation")
+            conv_input = gr.Textbox(
+                placeholder="Ask about Saudi Vision 2030...",
+                label="Your message"
+            )
+            with gr.Row():
+                conv_button = gr.Button("Send")
+                reset_button = gr.Button("Reset Conversation")
+            conv_sources = gr.Textbox(label="Sources Used")
+        # Sample evaluation event handlers
+        def handle_sample_selection(selection):
+            if not selection:
+                return "", "", "", "", "", "", ""
+            # Extract index from the selection string
+            try:
+                index = int(selection.split(".")[0]) - 1
+                query, response, reference, metrics, sources, category, language = run_evaluation_on_sample(assistant, index)
+                sources_str = ", ".join(sources)
+                return query, response, reference, metrics, sources_str, category, language
+            except:
+                return "Error processing selection", "", "", {}, "", "", ""
+        eval_button.click(
+            handle_sample_selection,
+            inputs=[sample_dropdown],
+            outputs=[sample_query, sample_response, sample_reference, metrics_display,
+                    sample_sources, sample_category, sample_language]
+        )
+        sample_dropdown.change(
+            handle_sample_selection,
+            inputs=[sample_dropdown],
+            outputs=[sample_query, sample_response, sample_reference, metrics_display,
+                    sample_sources, sample_category, sample_language]
+        )
+        # Custom evaluation event handlers
+        def handle_custom_evaluation(query, reference):
+            if not query:
+                return "Please enter a query", "", {}
+            # Reset conversation to ensure clean state
+            assistant.reset_conversation()
+            # Get response
+            response, sources, _ = assistant.answer(query)
+            sources_str = ", ".join(sources)
+            # Evaluate if reference is provided
+            metrics = {}
+            if reference:
+                metrics = evaluate_response(query, response, reference)
+            return response, sources_str, metrics
+        custom_eval_button.click(
+            handle_custom_evaluation,
+            inputs=[custom_query, custom_reference],
+            outputs=[custom_response, custom_sources, custom_metrics]
+        )
+        # Conversation mode event handlers
+        def handle_conversation(message, history):
+            if not message:
+                return history, "", ""
+            # Get response
+            response, sources, _ = assistant.answer(message)
+            sources_str = ", ".join(sources)
+            # Update history
+            history = history + [[message, response]]
+            return history, "", sources_str
+        def reset_conv():
+            result = assistant.reset_conversation()
+            return [], result, ""
+        conv_button.click(
+            handle_conversation,
+            inputs=[conv_input, chatbot],
+            outputs=[chatbot, conv_input, conv_sources]
+        )
+        reset_button.click(
+            reset_conv,
+            inputs=[],
+            outputs=[chatbot, conv_input, conv_sources]
+        )
+    return interface
+# Main function to run in Hugging Face Space
+def main():
+    """Build the assistant and launch the evaluation UI.
+
+    If initialization or launching raises, the error is printed and a minimal
+    Gradio interface is launched instead; it answers every input with the
+    failure message.
+    """
+    try:
+        assistant = initialize_system()
+        interface = qualitative_evaluation_interface(assistant)
+        interface.launch()
+    except Exception as e:
+        print(f"Error initializing system: {e}")
+        # Bind the message now: Python unbinds `e` when the except block ends,
+        # so a callback closing over `e` would raise NameError if it ran after
+        # launch() returned without blocking (e.g. in a notebook).
+        error_message = f"System initialization failed: {str(e)}"
+        # Create a simple error interface
+        gr.Interface(
+            fn=lambda x: error_message,
+            inputs=gr.Textbox(placeholder="System failed to initialize"),
+            outputs=gr.Textbox()
+        ).launch()
 if __name__ == "__main__":
+    main()