abdull4h committed on
Commit
cf43777
·
verified ·
1 Parent(s): 1a7b773

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -100
app.py CHANGED
@@ -369,85 +369,45 @@ class Vision2030Assistant:
369
  return ""
370
 
371
  def generate_response(self, user_input):
372
- """Generate response based on user input"""
373
  if not user_input or user_input.strip() == "":
374
  return ""
375
 
376
  start_time = time.time()
377
 
378
- # Default response in case of failure
379
- default_response = {
380
- "en": "I apologize, but I couldn't process your request properly. Please try again.",
381
- "ar": "أعتذر، لم أتمكن من معالجة طلبك بشكل صحيح. الرجاء المحاولة مرة أخرى."
382
- }
383
-
384
  try:
385
  # Detect language
386
  try:
387
  lang = detect(user_input)
388
- if lang != "ar": # Simplify to just Arabic vs non-Arabic
389
  lang = "en"
390
  except:
391
- lang = "en" # Default fallback
392
 
393
- logger.info(f"Detected language: {lang}")
394
-
395
- # Check for specific question patterns
396
- if lang == "ar":
397
- # National identity
398
- if "الهوية الوطنية" in user_input or "تعزيز الهوية" in user_input:
399
- reply = "تتضمن رؤية 2030 مبادرات متعددة لتعزيز الهوية الوطنية السعودية بما في ذلك البرامج الثقافية والحفاظ على التراث وتعزيز القيم السعودية."
400
- # Hajj and Umrah
401
- elif "المعتمرين" in user_input or "الحجاج" in user_input or "العمرة" in user_input or "الحج" in user_input:
402
- reply = "تهدف رؤية 2030 إلى زيادة القدرة على استقبال المعتمرين من 8 ملايين إلى 30 مليون معتمر سنويًا."
403
- # Economic diversification
404
- elif "تنويع مصادر الدخل" in user_input or "الاقتصاد المزدهر" in user_input or "تنمية الاقتصاد" in user_input:
405
- reply = "تهدف رؤية 2030 إلى زيادة الإيرادات الحكومية غير النفطية من 163 مليار ريال سعودي إلى 1 تريليون ريال سعودي من خلال تطوير قطاعات متنوعة مثل السياحة والتصنيع والطاقة المتجددة."
406
- # UNESCO sites
407
- elif "المواقع الأثرية" in user_input or "اليونسكو" in user_input or "التراث العالمي" in user_input:
408
- reply = "تضع رؤية 2030 هدفًا بتسجيل ما لا يقل عن 10 مواقع سعودية في قائمة التراث العالمي لليونسكو."
409
- # Real wealth
410
- elif "الثروة الحقيقية" in user_input or "أثمن" in user_input or "ثروة" in user_input:
411
- reply = "الثروة الحقيقية للمملكة العربية السعودية، كما أكدت رؤية 2030، هي شعبها، وخاصة الشباب."
412
- # Global gateway
413
- elif "بوابة للعالم" in user_input or "مكانتها" in user_input or "موقعها الاستراتيجي" in user_input:
414
- reply = "تهدف المملكة العربية السعودية إلى تعزيز مكانتها كبوابة عالمية من خلال الاستفادة من موقعها الاستراتيجي بين آسيا وأوروبا وأفريقيا."
415
- # Key pillars
416
- elif "ركائز" in user_input or "اركان" in user_input:
417
- reply = "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
418
- # General Vision 2030
419
- elif "ما هي" in user_input or "ماهي" in user_input:
420
- reply = "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
421
- else:
422
- # Use retrieved context
423
- context = self.retrieve_context(user_input, lang)
424
- reply = context if context else "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
425
- else: # English
426
- # Use retrieved context
427
  context = self.retrieve_context(user_input, lang)
428
- reply = context if context else "I couldn't find enough information about this question."
429
-
430
- # Record response time
431
- response_time = time.time() - start_time
432
- self.metrics["response_times"].append(response_time)
433
-
434
- logger.info(f"Generated response in {response_time:.2f}s")
435
-
436
- # Store the interaction for later evaluation
437
- interaction = {
438
- "timestamp": datetime.now().isoformat(),
439
- "user_input": user_input,
440
- "response": reply,
441
- "language": lang,
442
- "response_time": response_time
443
- }
444
- self.response_history.append(interaction)
445
-
446
- return reply
447
-
448
- except Exception as e:
449
- logger.error(f"Error generating response: {str(e)}")
450
- return default_response.get(lang, default_response["en"])
451
 
452
  def evaluate_factual_accuracy(self, response, reference):
453
  """Simple evaluation of factual accuracy by keyword matching"""
@@ -569,12 +529,12 @@ class Vision2030Assistant:
569
 
570
  @spaces.GPU
571
  def process_pdf(self, file):
572
- """Process uploaded PDF file"""
573
  if file is None:
574
  return "No file uploaded. Please select a PDF file."
575
 
576
  try:
577
- logger.info(f"Processing uploaded file")
578
 
579
  # Convert bytes to file-like object
580
  file_stream = io.BytesIO(file)
@@ -585,73 +545,94 @@ class Vision2030Assistant:
585
  # Extract text from the PDF
586
  full_text = ""
587
  for page_num in range(len(reader.pages)):
588
- page = reader.pages[page_num]
589
- extracted_text = page.extract_text()
590
- if extracted_text:
591
- full_text += extracted_text + "\n"
 
 
 
592
 
593
  if not full_text.strip():
594
  return "The uploaded PDF doesn't contain extractable text. Please try another file."
 
 
 
 
 
595
 
596
- # Process the extracted text with better chunking
 
597
  chunks = []
598
- paragraphs = re.split(r'\n\s*\n', full_text)
599
 
600
- for paragraph in paragraphs:
601
- # Skip very short paragraphs
602
- if len(paragraph.strip()) < 20:
 
 
 
603
  continue
604
 
605
- if len(paragraph) > 500: # For very long paragraphs
606
- # Split into smaller chunks
607
- sentences = re.split(r'(?<=[.!?])\s+', paragraph)
608
- current_chunk = ""
609
- for sentence in sentences:
610
- if len(current_chunk) + len(sentence) > 300:
611
- if current_chunk:
612
- chunks.append(current_chunk.strip())
613
- current_chunk = sentence
614
- else:
615
- current_chunk += " " + sentence if current_chunk else sentence
616
-
617
  if current_chunk:
618
  chunks.append(current_chunk.strip())
 
619
  else:
620
- chunks.append(paragraph.strip())
 
 
 
 
 
 
 
621
 
622
- # Categorize text by language
623
  english_chunks = []
624
  arabic_chunks = []
625
 
626
  for chunk in chunks:
627
  try:
628
- lang = detect(chunk)
629
- if lang == "ar":
630
  arabic_chunks.append(chunk)
631
  else:
632
- english_chunks.append(chunk)
 
 
 
 
 
633
  except:
634
- # If language detection fails, check for Arabic characters
635
  if any('\u0600' <= c <= '\u06FF' for c in chunk):
636
  arabic_chunks.append(chunk)
637
  else:
638
  english_chunks.append(chunk)
639
 
640
- # Store PDF content
641
  self.pdf_english_texts = english_chunks
642
  self.pdf_arabic_texts = arabic_chunks
643
 
644
- # Create indices for PDF content
645
  self._create_pdf_indices()
646
 
647
- logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
 
 
 
 
 
 
 
648
 
649
- return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. PDF content will now be prioritized when answering questions."
650
 
651
  except Exception as e:
652
  logger.error(f"Error processing PDF: {str(e)}")
653
  return f"❌ Error processing the PDF: {str(e)}. Please try another file."
654
-
655
  # Create the Gradio interface
656
  def create_interface():
657
  # Initialize the assistant
 
369
  return ""
370
 
371
  def generate_response(self, user_input):
372
+ """Generate responses by prioritizing PDF content over pre-defined answers"""
373
  if not user_input or user_input.strip() == "":
374
  return ""
375
 
376
  start_time = time.time()
377
 
 
 
 
 
 
 
378
  try:
379
  # Detect language
380
  try:
381
  lang = detect(user_input)
382
+ if lang != "ar":
383
  lang = "en"
384
  except:
385
+ lang = "en"
386
 
387
+ # Always try to retrieve from PDF first if available
388
+ if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  context = self.retrieve_context(user_input, lang)
390
+
391
+ # If we found content in the PDF, use it directly
392
+ if context and context.strip():
393
+ logger.info("Answering from PDF content")
394
+ reply = context
395
+
396
+ # Record metrics
397
+ response_time = time.time() - start_time
398
+ self.metrics["response_times"].append(response_time)
399
+
400
+ # Store the interaction
401
+ self.response_history.append({
402
+ "timestamp": datetime.now().isoformat(),
403
+ "user_input": user_input,
404
+ "response": reply,
405
+ "language": lang,
406
+ "response_time": response_time,
407
+ "source": "PDF document"
408
+ })
409
+
410
+ return reply
 
 
411
 
412
  def evaluate_factual_accuracy(self, response, reference):
413
  """Simple evaluation of factual accuracy by keyword matching"""
 
529
 
530
  @spaces.GPU
531
  def process_pdf(self, file):
532
+ """Process uploaded PDF with focus on extracting all content for answering questions"""
533
  if file is None:
534
  return "No file uploaded. Please select a PDF file."
535
 
536
  try:
537
+ logger.info("Processing uploaded PDF document")
538
 
539
  # Convert bytes to file-like object
540
  file_stream = io.BytesIO(file)
 
545
  # Extract text from the PDF
546
  full_text = ""
547
  for page_num in range(len(reader.pages)):
548
+ try:
549
+ page = reader.pages[page_num]
550
+ extracted_text = page.extract_text()
551
+ if extracted_text:
552
+ full_text += extracted_text + "\n"
553
+ except Exception as e:
554
+ logger.error(f"Error extracting text from page {page_num}: {str(e)}")
555
 
556
  if not full_text.strip():
557
  return "The uploaded PDF doesn't contain extractable text. Please try another file."
558
+
559
+ # First remove existing PDF content
560
+ self.pdf_english_texts = []
561
+ self.pdf_arabic_texts = []
562
+ self.has_pdf_content = False
563
 
564
+ # Process the extracted text into meaningful chunks
565
+ # Default chunk size of ~200-300 characters for better semantic indexing
566
  chunks = []
 
567
 
568
+ # Using sentences as more meaningful units than arbitrary chunks
569
+ sentences = re.split(r'(?<=[.!?])\s+', full_text)
570
+ current_chunk = ""
571
+
572
+ for sentence in sentences:
573
+ if not sentence.strip():
574
  continue
575
 
576
+ # If adding this sentence would make chunk too big, save current and start new
577
+ if len(current_chunk) + len(sentence) > 300:
 
 
 
 
 
 
 
 
 
 
578
  if current_chunk:
579
  chunks.append(current_chunk.strip())
580
+ current_chunk = sentence
581
  else:
582
+ current_chunk += " " + sentence if current_chunk else sentence
583
+
584
+ # Add the last chunk if any
585
+ if current_chunk:
586
+ chunks.append(current_chunk.strip())
587
+
588
+ # Filter out very short chunks (likely noise)
589
+ chunks = [chunk for chunk in chunks if len(chunk.strip()) > 30]
590
 
591
+ # Categorize by language with focus on accurate detection
592
  english_chunks = []
593
  arabic_chunks = []
594
 
595
  for chunk in chunks:
596
  try:
597
+ # Check for Arabic characters first (more reliable)
598
+ if any('\u0600' <= c <= '\u06FF' for c in chunk):
599
  arabic_chunks.append(chunk)
600
  else:
601
+ # Use language detection as backup
602
+ lang = detect(chunk)
603
+ if lang == "ar":
604
+ arabic_chunks.append(chunk)
605
+ else:
606
+ english_chunks.append(chunk)
607
  except:
608
+ # If detection fails, check for Arabic characters
609
  if any('\u0600' <= c <= '\u06FF' for c in chunk):
610
  arabic_chunks.append(chunk)
611
  else:
612
  english_chunks.append(chunk)
613
 
614
+ # Replace PDF content with new content
615
  self.pdf_english_texts = english_chunks
616
  self.pdf_arabic_texts = arabic_chunks
617
 
618
+ # Create high-quality embeddings - this is critical for accurate retrieval
619
  self._create_pdf_indices()
620
 
621
+ # Mark system to prioritize document content over pre-defined answers
622
+ self.has_pdf_content = True
623
+ self.prioritize_pdf_content = True
624
+
625
+ logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic and {len(english_chunks)} English segments")
626
+
627
+ # Also modify the retrieval threshold to ensure better matches
628
+ self.pdf_relevance_threshold = 1.2 # Lower threshold = stricter matching
629
 
630
+ return f"✅ Successfully processed your PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. The system will now answer questions directly from your document content."
631
 
632
  except Exception as e:
633
  logger.error(f"Error processing PDF: {str(e)}")
634
  return f"❌ Error processing the PDF: {str(e)}. Please try another file."
635
+
636
  # Create the Gradio interface
637
  def create_interface():
638
  # Initialize the assistant