abdull4h committed on
Commit
cf43777
·
verified ·
1 Parent(s): 1a7b773

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -100
app.py CHANGED
@@ -369,85 +369,45 @@ class Vision2030Assistant:
369
  return ""
370
 
371
  def generate_response(self, user_input):
372
- """Generate response based on user input"""
373
  if not user_input or user_input.strip() == "":
374
  return ""
375
 
376
  start_time = time.time()
377
 
378
- # Default response in case of failure
379
- default_response = {
380
- "en": "I apologize, but I couldn't process your request properly. Please try again.",
381
- "ar": "أعتذر، لم أتمكن من معالجة طلبك بشكل صحيح. الرجاء المحاولة مرة أخرى."
382
- }
383
-
384
  try:
385
  # Detect language
386
  try:
387
  lang = detect(user_input)
388
- if lang != "ar": # Simplify to just Arabic vs non-Arabic
389
  lang = "en"
390
  except:
391
- lang = "en" # Default fallback
392
 
393
- logger.info(f"Detected language: {lang}")
394
-
395
- # Check for specific question patterns
396
- if lang == "ar":
397
- # National identity
398
- if "الهوية الوطنية" in user_input or "تعزيز الهوية" in user_input:
399
- reply = "تتضمن رؤية 2030 مبادرات متعددة لتعزيز الهوية الوطنية السعودية بما في ذلك البرامج الثقافية والحفاظ على التراث وتعزيز القيم السعودية."
400
- # Hajj and Umrah
401
- elif "المعتمرين" in user_input or "الحجاج" in user_input or "العمرة" in user_input or "الحج" in user_input:
402
- reply = "تهدف رؤية 2030 إلى زيادة القدرة على استقبال المعتمرين من 8 ملايين إلى 30 مليون معتمر سنويًا."
403
- # Economic diversification
404
- elif "تنويع مصادر الدخل" in user_input or "الاقتصاد المزدهر" in user_input or "تنمية الاقتصاد" in user_input:
405
- reply = "تهدف رؤية 2030 إلى زيادة الإيرادات الحكومية غير النفطية من 163 مليار ريال سعودي إلى 1 تريليون ريال سعودي من خلال تطوير قطاعات متنوعة مثل السياحة والتصنيع والطاقة المتجددة."
406
- # UNESCO sites
407
- elif "المواقع الأثرية" in user_input or "اليونسكو" in user_input or "التراث العالمي" in user_input:
408
- reply = "تضع رؤية 2030 هدفًا بتسجيل ما لا يقل عن 10 مواقع سعودية في قائمة التراث العالمي لليونسكو."
409
- # Real wealth
410
- elif "الثروة الحقيقية" in user_input or "أثمن" in user_input or "ثروة" in user_input:
411
- reply = "الثروة الحقيقية للمملكة العربية السعودية، كما أكدت رؤية 2030، هي شعبها، وخاصة الشباب."
412
- # Global gateway
413
- elif "بوابة للعالم" in user_input or "مكانتها" in user_input or "موقعها الاستراتيجي" in user_input:
414
- reply = "تهدف المملكة العربية السعودية إلى تعزيز مكانتها كبوابة عالمية من خلال الاستفادة من موقعها الاستراتيجي بين آسيا وأوروبا وأفريقيا."
415
- # Key pillars
416
- elif "ركائز" in user_input or "اركان" in user_input:
417
- reply = "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
418
- # General Vision 2030
419
- elif "ما هي" in user_input or "ماهي" in user_input:
420
- reply = "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
421
- else:
422
- # Use retrieved context
423
- context = self.retrieve_context(user_input, lang)
424
- reply = context if context else "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
425
- else: # English
426
- # Use retrieved context
427
  context = self.retrieve_context(user_input, lang)
428
- reply = context if context else "I couldn't find enough information about this question."
429
-
430
- # Record response time
431
- response_time = time.time() - start_time
432
- self.metrics["response_times"].append(response_time)
433
-
434
- logger.info(f"Generated response in {response_time:.2f}s")
435
-
436
- # Store the interaction for later evaluation
437
- interaction = {
438
- "timestamp": datetime.now().isoformat(),
439
- "user_input": user_input,
440
- "response": reply,
441
- "language": lang,
442
- "response_time": response_time
443
- }
444
- self.response_history.append(interaction)
445
-
446
- return reply
447
-
448
- except Exception as e:
449
- logger.error(f"Error generating response: {str(e)}")
450
- return default_response.get(lang, default_response["en"])
451
 
452
  def evaluate_factual_accuracy(self, response, reference):
453
  """Simple evaluation of factual accuracy by keyword matching"""
@@ -569,12 +529,12 @@ class Vision2030Assistant:
569
 
570
  @spaces.GPU
571
  def process_pdf(self, file):
572
- """Process uploaded PDF file"""
573
  if file is None:
574
  return "No file uploaded. Please select a PDF file."
575
 
576
  try:
577
- logger.info(f"Processing uploaded file")
578
 
579
  # Convert bytes to file-like object
580
  file_stream = io.BytesIO(file)
@@ -585,73 +545,94 @@ class Vision2030Assistant:
585
  # Extract text from the PDF
586
  full_text = ""
587
  for page_num in range(len(reader.pages)):
588
- page = reader.pages[page_num]
589
- extracted_text = page.extract_text()
590
- if extracted_text:
591
- full_text += extracted_text + "\n"
 
 
 
592
 
593
  if not full_text.strip():
594
  return "The uploaded PDF doesn't contain extractable text. Please try another file."
 
 
 
 
 
595
 
596
- # Process the extracted text with better chunking
 
597
  chunks = []
598
- paragraphs = re.split(r'\n\s*\n', full_text)
599
 
600
- for paragraph in paragraphs:
601
- # Skip very short paragraphs
602
- if len(paragraph.strip()) < 20:
 
 
 
603
  continue
604
 
605
- if len(paragraph) > 500: # For very long paragraphs
606
- # Split into smaller chunks
607
- sentences = re.split(r'(?<=[.!?])\s+', paragraph)
608
- current_chunk = ""
609
- for sentence in sentences:
610
- if len(current_chunk) + len(sentence) > 300:
611
- if current_chunk:
612
- chunks.append(current_chunk.strip())
613
- current_chunk = sentence
614
- else:
615
- current_chunk += " " + sentence if current_chunk else sentence
616
-
617
  if current_chunk:
618
  chunks.append(current_chunk.strip())
 
619
  else:
620
- chunks.append(paragraph.strip())
 
 
 
 
 
 
 
621
 
622
- # Categorize text by language
623
  english_chunks = []
624
  arabic_chunks = []
625
 
626
  for chunk in chunks:
627
  try:
628
- lang = detect(chunk)
629
- if lang == "ar":
630
  arabic_chunks.append(chunk)
631
  else:
632
- english_chunks.append(chunk)
 
 
 
 
 
633
  except:
634
- # If language detection fails, check for Arabic characters
635
  if any('\u0600' <= c <= '\u06FF' for c in chunk):
636
  arabic_chunks.append(chunk)
637
  else:
638
  english_chunks.append(chunk)
639
 
640
- # Store PDF content
641
  self.pdf_english_texts = english_chunks
642
  self.pdf_arabic_texts = arabic_chunks
643
 
644
- # Create indices for PDF content
645
  self._create_pdf_indices()
646
 
647
- logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
 
 
 
 
 
 
 
648
 
649
- return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. PDF content will now be prioritized when answering questions."
650
 
651
  except Exception as e:
652
  logger.error(f"Error processing PDF: {str(e)}")
653
  return f"❌ Error processing the PDF: {str(e)}. Please try another file."
654
-
655
  # Create the Gradio interface
656
  def create_interface():
657
  # Initialize the assistant
 
369
  return ""
370
 
371
  def generate_response(self, user_input):
372
+ """Generate responses by prioritizing PDF content over pre-defined answers"""
373
  if not user_input or user_input.strip() == "":
374
  return ""
375
 
376
  start_time = time.time()
377
 
 
 
 
 
 
 
378
  try:
379
  # Detect language
380
  try:
381
  lang = detect(user_input)
382
+ if lang != "ar":
383
  lang = "en"
384
  except:
385
+ lang = "en"
386
 
387
+ # Always try to retrieve from PDF first if available
388
+ if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  context = self.retrieve_context(user_input, lang)
390
+
391
+ # If we found content in the PDF, use it directly
392
+ if context and context.strip():
393
+ logger.info("Answering from PDF content")
394
+ reply = context
395
+
396
+ # Record metrics
397
+ response_time = time.time() - start_time
398
+ self.metrics["response_times"].append(response_time)
399
+
400
+ # Store the interaction
401
+ self.response_history.append({
402
+ "timestamp": datetime.now().isoformat(),
403
+ "user_input": user_input,
404
+ "response": reply,
405
+ "language": lang,
406
+ "response_time": response_time,
407
+ "source": "PDF document"
408
+ })
409
+
410
+ return reply
 
 
411
 
412
  def evaluate_factual_accuracy(self, response, reference):
413
  """Simple evaluation of factual accuracy by keyword matching"""
 
529
 
530
  @spaces.GPU
531
  def process_pdf(self, file):
532
+ """Process uploaded PDF with focus on extracting all content for answering questions"""
533
  if file is None:
534
  return "No file uploaded. Please select a PDF file."
535
 
536
  try:
537
+ logger.info("Processing uploaded PDF document")
538
 
539
  # Convert bytes to file-like object
540
  file_stream = io.BytesIO(file)
 
545
  # Extract text from the PDF
546
  full_text = ""
547
  for page_num in range(len(reader.pages)):
548
+ try:
549
+ page = reader.pages[page_num]
550
+ extracted_text = page.extract_text()
551
+ if extracted_text:
552
+ full_text += extracted_text + "\n"
553
+ except Exception as e:
554
+ logger.error(f"Error extracting text from page {page_num}: {str(e)}")
555
 
556
  if not full_text.strip():
557
  return "The uploaded PDF doesn't contain extractable text. Please try another file."
558
+
559
+ # First remove existing PDF content
560
+ self.pdf_english_texts = []
561
+ self.pdf_arabic_texts = []
562
+ self.has_pdf_content = False
563
 
564
+ # Process the extracted text into meaningful chunks
565
+ # Default chunk size of ~200-300 characters for better semantic indexing
566
  chunks = []
 
567
 
568
+ # Using sentences as more meaningful units than arbitrary chunks
569
+ sentences = re.split(r'(?<=[.!?])\s+', full_text)
570
+ current_chunk = ""
571
+
572
+ for sentence in sentences:
573
+ if not sentence.strip():
574
  continue
575
 
576
+ # If adding this sentence would make chunk too big, save current and start new
577
+ if len(current_chunk) + len(sentence) > 300:
 
 
 
 
 
 
 
 
 
 
578
  if current_chunk:
579
  chunks.append(current_chunk.strip())
580
+ current_chunk = sentence
581
  else:
582
+ current_chunk += " " + sentence if current_chunk else sentence
583
+
584
+ # Add the last chunk if any
585
+ if current_chunk:
586
+ chunks.append(current_chunk.strip())
587
+
588
+ # Filter out very short chunks (likely noise)
589
+ chunks = [chunk for chunk in chunks if len(chunk.strip()) > 30]
590
 
591
+ # Categorize by language with focus on accurate detection
592
  english_chunks = []
593
  arabic_chunks = []
594
 
595
  for chunk in chunks:
596
  try:
597
+ # Check for Arabic characters first (more reliable)
598
+ if any('\u0600' <= c <= '\u06FF' for c in chunk):
599
  arabic_chunks.append(chunk)
600
  else:
601
+ # Use language detection as backup
602
+ lang = detect(chunk)
603
+ if lang == "ar":
604
+ arabic_chunks.append(chunk)
605
+ else:
606
+ english_chunks.append(chunk)
607
  except:
608
+ # If detection fails, check for Arabic characters
609
  if any('\u0600' <= c <= '\u06FF' for c in chunk):
610
  arabic_chunks.append(chunk)
611
  else:
612
  english_chunks.append(chunk)
613
 
614
+ # Replace PDF content with new content
615
  self.pdf_english_texts = english_chunks
616
  self.pdf_arabic_texts = arabic_chunks
617
 
618
+ # Create high-quality embeddings - this is critical for accurate retrieval
619
  self._create_pdf_indices()
620
 
621
+ # Mark system to prioritize document content over pre-defined answers
622
+ self.has_pdf_content = True
623
+ self.prioritize_pdf_content = True
624
+
625
+ logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic and {len(english_chunks)} English segments")
626
+
627
+ # Also modify the retrieval threshold to ensure better matches
628
+ self.pdf_relevance_threshold = 1.2 # Lower threshold = stricter matching
629
 
630
+ return f"✅ Successfully processed your PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. The system will now answer questions directly from your document content."
631
 
632
  except Exception as e:
633
  logger.error(f"Error processing PDF: {str(e)}")
634
  return f"❌ Error processing the PDF: {str(e)}. Please try another file."
635
+
636
  # Create the Gradio interface
637
  def create_interface():
638
  # Initialize the assistant