abdull4h committed
Commit 39d753a · verified · 1 Parent(s): 4cf7bd8

Update app.py

Files changed (1):
  1. app.py +121 -106
app.py CHANGED
@@ -1,5 +1,5 @@
- # Vision 2030 Virtual Assistant with RAG and Evaluation Framework
- # Modified for Hugging Face Spaces compatibility with GPU support
+ # Minimal version for Hugging Face Spaces
+ # File: app.py
 
  import gradio as gr
  import time
@@ -12,6 +12,7 @@ import pandas as pd
  import matplotlib.pyplot as plt
  from sklearn.metrics import precision_recall_fscore_support, accuracy_score
  import PyPDF2
+ import io
  import json
  from langdetect import detect
  from sentence_transformers import SentenceTransformer
@@ -29,42 +30,29 @@ logging.basicConfig(
  )
  logger = logging.getLogger('vision2030_assistant')
 
- # Check for GPU availability
+ # Check for GPU availability (but don't rely on it)
  has_gpu = torch.cuda.is_available()
  logger.info(f"GPU available: {has_gpu}")
 
  class Vision2030Assistant:
-     def __init__(self, pdf_path=None, eval_data_path=None):
-         """
-         Initialize the Vision 2030 Assistant with embedding models and evaluation framework
-
-         Args:
-             pdf_path: Path to the Vision 2030 PDF document
-             eval_data_path: Path to evaluation dataset
-         """
+     def __init__(self):
+         """Initialize the Vision 2030 Assistant with embedding models and sample data"""
          logger.info("Initializing Vision 2030 Assistant...")
 
-         # Initialize embedding models only (no LLMs to avoid tokenizer issues)
+         # Initialize embedding models
          self.load_embedding_models()
 
-         # Load documents
-         if pdf_path and os.path.exists(pdf_path):
-             self.load_and_process_documents(pdf_path)
-         else:
-             self._create_sample_data()
-             self._create_indices()
-
-         # Setup evaluation framework
-         if eval_data_path and os.path.exists(eval_data_path):
-             with open(eval_data_path, 'r', encoding='utf-8') as f:
-                 self.eval_data = json.load(f)
-         else:
-             self._create_sample_eval_data()
-
+         # Use sample data to start
+         self._create_sample_data()
+         self._create_indices()
+
+         # Create sample evaluation data
+         self._create_sample_eval_data()
+
+         # Initialize metrics
          self.metrics = {
              "response_times": [],
              "user_ratings": [],
-             "retrieval_precision": [],
              "factual_accuracy": []
          }
          self.response_history = []
@@ -73,7 +61,7 @@ class Vision2030Assistant:
      @spaces.GPU
      def load_embedding_models(self):
          """Load embedding models for retrieval with GPU support"""
-         logger.info("Loading embedding models with GPU support...")
+         logger.info("Loading embedding models...")
 
          try:
              # Load embedding models
@@ -89,7 +77,7 @@
              logger.info("Embedding models loaded successfully")
          except Exception as e:
              logger.error(f"Error loading embedding models: {str(e)}")
-             # Create simple placeholder models if loading fails
+             # Create simple fallback embedding method
              self._create_fallback_embedders()
 
      def _create_fallback_embedders(self):
@@ -118,51 +106,8 @@
          self.arabic_embedder = SimpleEmbedder()
          self.english_embedder = SimpleEmbedder()
 
-     def load_and_process_documents(self, pdf_path):
-         """Load and process the Vision 2030 document from PDF"""
-         logger.info(f"Processing Vision 2030 document from {pdf_path}")
-
-         # Initialize empty document lists
-         self.english_texts = []
-         self.arabic_texts = []
-
-         try:
-             # Extract text from PDF
-             with open(pdf_path, 'rb') as file:
-                 reader = PyPDF2.PdfReader(file)
-                 full_text = ""
-                 for page_num in range(len(reader.pages)):
-                     page = reader.pages[page_num]
-                     full_text += page.extract_text() + "\n"
-
-             # Split into chunks (simple approach - could be improved with better text segmentation)
-             chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
-
-             # Detect language and add to appropriate list
-             for chunk in chunks:
-                 try:
-                     lang = detect(chunk)
-                     if lang == "ar":
-                         self.arabic_texts.append(chunk)
-                     else:  # Default to English for other languages
-                         self.english_texts.append(chunk)
-                 except:
-                     # If language detection fails, assume English
-                     self.english_texts.append(chunk)
-
-             logger.info(f"Processed {len(self.arabic_texts)} Arabic and {len(self.english_texts)} English chunks")
-
-             # Create FAISS indices
-             self._create_indices()
-
-         except Exception as e:
-             logger.error(f"Error processing PDF: {str(e)}")
-             logger.info("Using fallback sample data")
-             self._create_sample_data()
-             self._create_indices()
-
      def _create_sample_data(self):
-         """Create sample Vision 2030 data if PDF processing fails"""
+         """Create sample Vision 2030 data"""
          logger.info("Creating sample Vision 2030 data")
 
          # English sample texts
@@ -195,7 +140,7 @@
 
      @spaces.GPU
      def _create_indices(self):
-         """Create FAISS indices for fast text retrieval with GPU support"""
+         """Create FAISS indices for fast text retrieval"""
          logger.info("Creating FAISS indices for text retrieval")
 
          try:
@@ -275,23 +220,13 @@
                  "question": "ما هو مشروع البحر الأحمر؟",
                  "lang": "ar",
                  "reference_answer": "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي."
-             },
-             {
-                 "question": "What are the goals for women's workforce participation?",
-                 "lang": "en",
-                 "reference_answer": "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%."
-             },
-             {
-                 "question": "ما هي القدية؟",
-                 "lang": "ar",
-                 "reference_answer": "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030."
              }
          ]
          logger.info(f"Created {len(self.eval_data)} sample evaluation examples")
 
      @spaces.GPU
      def retrieve_context(self, query, lang):
-         """Retrieve relevant context for a query based on language with GPU support"""
+         """Retrieve relevant context for a query based on language"""
          start_time = time.time()
 
          try:
@@ -323,7 +258,10 @@
              return ""
 
      def generate_response(self, user_input):
-         """Generate a response to user input using retrieval and predefined responses for evaluation"""
+         """Generate a response to user input using retrieval and predefined responses"""
+         if not user_input or user_input.strip() == "":
+             return ""
+
          start_time = time.time()
 
          # Default response in case of failure
@@ -346,7 +284,7 @@
          # Retrieve relevant context
          context = self.retrieve_context(user_input, lang)
 
-         # Simplified response generation for HF Spaces
+         # Simplified response generation
          if lang == "ar":
              if "ركائز" in user_input or "اركان" in user_input:
                  reply = "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
@@ -358,6 +296,8 @@
                  reply = "تهدف رؤية 2030 إلى زيادة مشاركة المرأة في القوى العاملة من 22٪ إلى 30٪."
              elif "القدية" in user_input:
                  reply = "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030."
+             elif "ماهي" in user_input or "ما هي" in user_input:
+                 reply = "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
              else:
                  # Use the retrieved context directly if available
                  reply = context if context else "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
@@ -372,6 +312,8 @@
                  reply = "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%."
              elif "qiddiya" in user_input.lower():
                  reply = "Qiddiya is an entertainment mega-project being built in Riyadh as part of Vision 2030."
+             elif "what is" in user_input.lower():
+                 reply = "Vision 2030 is Saudi Arabia's strategic framework to reduce dependence on oil, diversify the economy, and develop public sectors. The key pillars are a vibrant society, a thriving economy, and an ambitious nation."
              else:
                  # Use the retrieved context directly if available
                  reply = context if context else "I couldn't find enough information about this question."
@@ -422,7 +364,7 @@
 
      @spaces.GPU
      def evaluate_on_test_set(self):
-         """Evaluate the assistant on the test set with GPU support"""
+         """Evaluate the assistant on the test set"""
          logger.info("Running evaluation on test set")
 
          eval_results = []
@@ -516,14 +458,70 @@
 
          return True
 
+     @spaces.GPU
+     def process_uploaded_pdf(self, file):
+         """Process uploaded PDF and extract text content"""
+         if file is None:
+             return "No file uploaded. Please select a PDF file."
+
+         try:
+             logger.info(f"Processing uploaded file")
+
+             # Use PyPDF2 to read the file content directly
+             reader = PyPDF2.PdfReader(file)
+
+             # Extract text from the PDF
+             full_text = ""
+             for page_num in range(len(reader.pages)):
+                 page = reader.pages[page_num]
+                 extracted_text = page.extract_text()
+                 if extracted_text:
+                     full_text += extracted_text + "\n"
+
+             if not full_text.strip():
+                 return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+             # Process the extracted text
+             chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
+
+             # Categorize text by language
+             english_chunks = []
+             arabic_chunks = []
+
+             for chunk in chunks:
+                 try:
+                     lang = detect(chunk)
+                     if lang == "ar":
+                         arabic_chunks.append(chunk)
+                     else:
+                         english_chunks.append(chunk)
+                 except:
+                     # If language detection fails, assume English
+                     english_chunks.append(chunk)
+
+             # Update the assistant's knowledge base
+             self.english_texts = english_chunks
+             self.arabic_texts = arabic_chunks
+
+             # Recreate indices
+             self._create_indices()
+
+             logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
+
+             return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments."
+
+         except Exception as e:
+             logger.error(f"Error processing PDF: {str(e)}")
+             return f"❌ Error processing the PDF: {str(e)}. Please try another file."
+
  # Create the Gradio interface
- def create_gradio_interface():
+ def create_interface():
      try:
          # Initialize the assistant
          assistant = Vision2030Assistant()
 
          def chat(message, history):
-             if not message.strip():
+             if not message or message.strip() == "":
                  return history, ""
 
              # Generate response
@@ -568,15 +566,6 @@ def create_gradio_interface():
              fig = assistant.visualize_evaluation_results(results)
 
              return summary, fig
-
-         @spaces.GPU
-         def process_uploaded_file(file):
-             if file is not None:
-                 # Create a new assistant with the uploaded PDF
-                 global assistant
-                 assistant = Vision2030Assistant(pdf_path=file.name)
-                 return f"Successfully processed {file.name}. The assistant is ready to use."
-             return "No file uploaded. Using sample data."
 
          # Create the Gradio interface
          with gr.Blocks() as demo:
@@ -603,9 +592,34 @@ def create_gradio_interface():
              eval_chart = gr.Plot(label="Evaluation Metrics")
 
              with gr.Tab("Upload PDF"):
-                 file_input = gr.File(label="Upload Vision 2030 PDF")
-                 upload_result = gr.Textbox(label="Upload Status")
-                 upload_btn = gr.Button("Process PDF")
+                 gr.Markdown("""
+                 ### Upload a Vision 2030 PDF Document
+                 Upload a PDF document to enhance the assistant's knowledge base.
+                 """)
+
+                 with gr.Row():
+                     file_input = gr.File(
+                         label="Select PDF File",
+                         file_types=[".pdf"],
+                         type="binary"  # Important: Use binary mode
+                     )
+
+                 with gr.Row():
+                     upload_btn = gr.Button("Process PDF", variant="primary")
+
+                 with gr.Row():
+                     upload_status = gr.Textbox(
+                         label="Upload Status",
+                         placeholder="Upload status will appear here...",
+                         interactive=False
+                     )
+
+                 gr.Markdown("""
+                 ### Notes:
+                 - The PDF should contain text that can be extracted (not scanned images)
+                 - After uploading, you can return to the Chat tab to ask questions about the uploaded content
+                 - If no PDF is uploaded, the assistant will use default Vision 2030 information
+                 """)
 
              # Set up event handlers
              msg.submit(chat, [msg, chatbot], [chatbot, msg])
@@ -613,18 +627,19 @@ def create_gradio_interface():
              clear_btn.click(lambda: [], None, chatbot)
              feedback_btn.click(provide_feedback, [chatbot, rating, feedback_text], feedback_result)
              evaluate_btn.click(run_evaluation, None, [eval_output, eval_chart])
-             upload_btn.click(process_uploaded_file, [file_input], upload_result)
+             upload_btn.click(assistant.process_uploaded_pdf, [file_input], [upload_status])
 
          return demo
+
      except Exception as e:
          logger.error(f"Error creating Gradio interface: {str(e)}")
-         # Create a simple demo for fallback
+         # Create a simple fallback demo if there's an error
          with gr.Blocks() as demo:
              gr.Markdown("# Vision 2030 Virtual Assistant")
              gr.Markdown("There was an error initializing the assistant. Please check the logs.")
              gr.Markdown(f"Error: {str(e)}")
          return demo
 
- # Launch the app with proper GPU initialization
- demo = create_gradio_interface()
+ # Launch the app
+ demo = create_interface()
  demo.launch()
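
A note on the new upload path: with type="binary", Gradio passes the handler the raw bytes of the uploaded file, while PyPDF2.PdfReader expects a file path or a seekable stream rather than a bytes object. The commit adds import io but still passes file straight to PdfReader, so presumably the bytes are meant to be wrapped in io.BytesIO first. A minimal sketch of that wrapping, assuming the binary input above (the helper name read_pdf_bytes is hypothetical, not part of the commit):

    import io
    import PyPDF2

    def read_pdf_bytes(data: bytes) -> str:
        # Wrap the raw bytes from a Gradio type="binary" File input in a
        # seekable buffer so PyPDF2 can parse it.
        reader = PyPDF2.PdfReader(io.BytesIO(data))
        # extract_text() can return None for pages without extractable text
        return "\n".join(page.extract_text() or "" for page in reader.pages)

Calling a helper like this at the top of process_uploaded_pdf would leave the chunking and language-detection logic unchanged.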