Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -369,85 +369,45 @@ class Vision2030Assistant:
         return ""
 
     def generate_response(self, user_input):
-        """Generate
+        """Generate responses by prioritizing PDF content over pre-defined answers"""
         if not user_input or user_input.strip() == "":
             return ""
 
         start_time = time.time()
 
-        # Default response in case of failure
-        default_response = {
-            "en": "I apologize, but I couldn't process your request properly. Please try again.",
-            "ar": "أعتذر، لم أتمكن من معالجة طلبك بشكل صحيح. الرجاء المحاولة مرة أخرى."
-        }
-
         try:
             # Detect language
             try:
                 lang = detect(user_input)
-                if lang != "ar":
+                if lang != "ar":
                     lang = "en"
             except:
-                lang = "en"
+                lang = "en"
 
-
-
-            # Check for specific question patterns
-            if lang == "ar":
-                # National identity
-                if "الهوية الوطنية" in user_input or "تعزيز الهوية" in user_input:
-                    reply = "تتضمن رؤية 2030 مبادرات متعددة لتعزيز الهوية الوطنية السعودية بما في ذلك البرامج الثقافية والحفاظ على التراث وتعزيز القيم السعودية."
-                # Hajj and Umrah
-                elif "المعتمرين" in user_input or "الحجاج" in user_input or "العمرة" in user_input or "الحج" in user_input:
-                    reply = "تهدف رؤية 2030 إلى زيادة القدرة على استقبال المعتمرين من 8 ملايين إلى 30 مليون معتمر سنويًا."
-                # Economic diversification
-                elif "تنويع مصادر الدخل" in user_input or "الاقتصاد المزدهر" in user_input or "تنمية الاقتصاد" in user_input:
-                    reply = "تهدف رؤية 2030 إلى زيادة الإيرادات الحكومية غير النفطية من 163 مليار ريال سعودي إلى 1 تريليون ريال سعودي من خلال تطوير قطاعات متنوعة مثل السياحة والتصنيع والطاقة المتجددة."
-                # UNESCO sites
-                elif "المواقع الأثرية" in user_input or "اليونسكو" in user_input or "التراث العالمي" in user_input:
-                    reply = "تضع رؤية 2030 هدفًا بتسجيل ما لا يقل عن 10 مواقع سعودية في قائمة التراث العالمي لليونسكو."
-                # Real wealth
-                elif "الثروة الحقيقية" in user_input or "أثمن" in user_input or "ثروة" in user_input:
-                    reply = "الثروة الحقيقية للمملكة العربية السعودية، كما أكدت رؤية 2030، هي شعبها، وخاصة الشباب."
-                # Global gateway
-                elif "بوابة للعالم" in user_input or "مكانتها" in user_input or "موقعها الاستراتيجي" in user_input:
-                    reply = "تهدف المملكة العربية السعودية إلى تعزيز مكانتها كبوابة عالمية من خلال الاستفادة من موقعها الاستراتيجي بين آسيا وأوروبا وأفريقيا."
-                # Key pillars
-                elif "ركائز" in user_input or "اركان" in user_input:
-                    reply = "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
-                # General Vision 2030
-                elif "ما هي" in user_input or "ماهي" in user_input:
-                    reply = "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
-                else:
-                    # Use retrieved context
-                    context = self.retrieve_context(user_input, lang)
-                    reply = context if context else "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
-            else: # English
-                # Use retrieved context
+            # Always try to retrieve from PDF first if available
+            if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
                 context = self.retrieve_context(user_input, lang)
-            [old lines 428-447 not recoverable from this view]
-        except Exception as e:
-            logger.error(f"Error generating response: {str(e)}")
-            return default_response.get(lang, default_response["en"])
+
+                # If we found content in the PDF, use it directly
+                if context and context.strip():
+                    logger.info("Answering from PDF content")
+                    reply = context
+
+                    # Record metrics
+                    response_time = time.time() - start_time
+                    self.metrics["response_times"].append(response_time)
+
+                    # Store the interaction
+                    self.response_history.append({
+                        "timestamp": datetime.now().isoformat(),
+                        "user_input": user_input,
+                        "response": reply,
+                        "language": lang,
+                        "response_time": response_time,
+                        "source": "PDF document"
+                    })
+
+                    return reply
 
     def evaluate_factual_accuracy(self, response, reference):
         """Simple evaluation of factual accuracy by keyword matching"""
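A note on the language handling this hunk keeps: anything not detected as Arabic collapses to English, and detection failures also default to English. A minimal standalone sketch, assuming the app's `detect` comes from the `langdetect` package (its import is outside this diff):

```python
# Sketch only; `langdetect` as the source of detect() is an assumption.
from langdetect import detect, LangDetectException

def detect_language(text: str) -> str:
    """Collapse detection to the two languages the assistant supports."""
    try:
        lang = detect(text)        # ISO 639-1 code such as 'ar', 'en', 'fr'
    except LangDetectException:    # raised for empty or undetectable input
        return "en"
    return "ar" if lang == "ar" else "en"

print(detect_language("ما هي رؤية 2030؟"))      # -> ar
print(detect_language("What is Vision 2030?"))  # -> en
```

The committed code reaches the same fallback through a bare `except:`; catching the dedicated exception type, as above, is the narrower alternative.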
@@ -569,12 +529,12 @@ class Vision2030Assistant:
 
     @spaces.GPU
     def process_pdf(self, file):
-        """Process uploaded PDF
+        """Process uploaded PDF with focus on extracting all content for answering questions"""
         if file is None:
             return "No file uploaded. Please select a PDF file."
 
         try:
-            logger.info(
+            logger.info("Processing uploaded PDF document")
 
             # Convert bytes to file-like object
             file_stream = io.BytesIO(file)
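The unchanged `file_stream = io.BytesIO(file)` line is why this handler works at all: the upload arrives as raw bytes, and a PDF reader needs a seekable file-like object. A sketch of the same flow in isolation, assuming `reader` is a PyPDF2 `PdfReader` (consistent with the `reader.pages` calls in the next hunk; `example.pdf` is a placeholder path):

```python
import io
from PyPDF2 import PdfReader  # assumed library; the import is not shown in this diff

with open("example.pdf", "rb") as f:  # placeholder file standing in for the upload
    raw = f.read()                    # raw bytes, the same shape the handler receives

reader = PdfReader(io.BytesIO(raw))   # BytesIO makes the bytes seekable
print(f"Loaded {len(reader.pages)} pages")
```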
@@ -585,73 +545,94 @@ class Vision2030Assistant:
             # Extract text from the PDF
             full_text = ""
             for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                extracted_text = page.extract_text()
-                if extracted_text:
-                    full_text += extracted_text + "\n"
+                try:
+                    page = reader.pages[page_num]
+                    extracted_text = page.extract_text()
+                    if extracted_text:
+                        full_text += extracted_text + "\n"
+                except Exception as e:
+                    logger.error(f"Error extracting text from page {page_num}: {str(e)}")
 
             if not full_text.strip():
                 return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+            # First remove existing PDF content
+            self.pdf_english_texts = []
+            self.pdf_arabic_texts = []
+            self.has_pdf_content = False
 
-            # Process the extracted text
+            # Process the extracted text into meaningful chunks
+            # Default chunk size of ~200-300 characters for better semantic indexing
             chunks = []
-            paragraphs = re.split(r'\n\s*\n', full_text)
 
-            for paragraph in paragraphs:
-                paragraph = paragraph.strip()
-                if not paragraph:
+            # Using sentences as more meaningful units than arbitrary chunks
+            sentences = re.split(r'(?<=[.!?])\s+', full_text)
+            current_chunk = ""
+
+            for sentence in sentences:
+                if not sentence.strip():
                     continue
 
-                [old line 605 not recoverable]
-                if len(paragraph) > 300:
-                    sentences = re.split(r'(?<=[.!?])\s+', paragraph)
-                    current_chunk = ""
-                    for sentence in sentences:
-                        if len(current_chunk) + len(sentence) > 300:
-                            if current_chunk:
-                                chunks.append(current_chunk.strip())
-                            current_chunk = sentence
-                        else:
-                            current_chunk += " " + sentence if current_chunk else sentence
-
+                # If adding this sentence would make chunk too big, save current and start new
+                if len(current_chunk) + len(sentence) > 300:
                     if current_chunk:
                         chunks.append(current_chunk.strip())
+                    current_chunk = sentence
                 else:
-                    chunks.append(paragraph)
+                    current_chunk += " " + sentence if current_chunk else sentence
+
+            # Add the last chunk if any
+            if current_chunk:
+                chunks.append(current_chunk.strip())
+
+            # Filter out very short chunks (likely noise)
+            chunks = [chunk for chunk in chunks if len(chunk.strip()) > 30]
 
-            # Categorize
+            # Categorize by language with focus on accurate detection
             english_chunks = []
             arabic_chunks = []
 
             for chunk in chunks:
                 try:
-                    lang = detect(chunk)
-                    if lang == "ar":
+                    # Check for Arabic characters first (more reliable)
+                    if any('\u0600' <= c <= '\u06FF' for c in chunk):
                         arabic_chunks.append(chunk)
                     else:
-                        english_chunks.append(chunk)
+                        # Use language detection as backup
+                        lang = detect(chunk)
+                        if lang == "ar":
+                            arabic_chunks.append(chunk)
+                        else:
+                            english_chunks.append(chunk)
                 except:
-                    # If
+                    # If detection fails, check for Arabic characters
                     if any('\u0600' <= c <= '\u06FF' for c in chunk):
                         arabic_chunks.append(chunk)
                     else:
                         english_chunks.append(chunk)
 
-            #
+            # Replace PDF content with new content
             self.pdf_english_texts = english_chunks
             self.pdf_arabic_texts = arabic_chunks
 
-            # Create
+            # Create high-quality embeddings - this is critical for accurate retrieval
             self._create_pdf_indices()
 
-
+            # Mark system to prioritize document content over pre-defined answers
+            self.has_pdf_content = True
+            self.prioritize_pdf_content = True
+
+            logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic and {len(english_chunks)} English segments")
+
+            # Also modify the retrieval threshold to ensure better matches
+            self.pdf_relevance_threshold = 1.2  # Lower threshold = stricter matching
 
-            return f"✅ Successfully processed
+            return f"✅ Successfully processed your PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. The system will now answer questions directly from your document content."
 
         except Exception as e:
             logger.error(f"Error processing PDF: {str(e)}")
             return f"❌ Error processing the PDF: {str(e)}. Please try another file."
-
+
 # Create the Gradio interface
 def create_interface():
     # Initialize the assistant
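The new chunking loop is easiest to check in isolation. A sketch that mirrors it as a pure function; `chunk_sentences`, `max_len`, and `min_len` are illustrative names, while the 300-character cap and 30-character noise floor come straight from the diff:

```python
import re

def chunk_sentences(full_text: str, max_len: int = 300, min_len: int = 30) -> list[str]:
    """Greedily pack sentences into chunks of roughly max_len characters."""
    chunks, current = [], ""
    for sentence in re.split(r'(?<=[.!?])\s+', full_text):
        if not sentence.strip():
            continue
        if len(current) + len(sentence) > max_len:  # chunk would grow too big
            if current:
                chunks.append(current.strip())
            current = sentence                      # start a fresh chunk
        else:
            current = f"{current} {sentence}" if current else sentence
    if current:                                     # flush the final chunk
        chunks.append(current.strip())
    return [c for c in chunks if len(c.strip()) > min_len]  # drop likely noise

print(chunk_sentences("One sentence here. Another follows! Does a third? " * 8))
```

Chunking on sentence boundaries keeps each embedded unit semantically whole, which is the stated motivation for dropping the old paragraph-first splitter.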
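Likewise, the two-stage language split (Arabic character range first, statistical detection as backup) condenses to a small helper. A sketch mirroring the committed behavior; `categorize` and `has_arabic` are illustrative names, not functions in app.py:

```python
from langdetect import detect  # assumption: the same detect() app.py imports

def has_arabic(text: str) -> bool:
    # The Arabic Unicode block checked in the diff: U+0600 through U+06FF.
    return any('\u0600' <= c <= '\u06FF' for c in text)

def categorize(chunks: list[str]) -> tuple[list[str], list[str]]:
    """Return (arabic_chunks, english_chunks)."""
    arabic, english = [], []
    for chunk in chunks:
        try:
            # Short-circuit: detect() only runs when no Arabic characters are found.
            is_arabic = has_arabic(chunk) or detect(chunk) == "ar"
        except Exception:
            is_arabic = has_arabic(chunk)  # detection failed; trust the range check
        (arabic if is_arabic else english).append(chunk)
    return arabic, english

ar, en = categorize(["رؤية 2030 إطار استراتيجي.", "Vision 2030 is a strategic framework."])
print(len(ar), len(en))  # -> 1 1
```

The character-range test runs first because it is deterministic, while `detect` is statistical and can misclassify short or mixed-language strings.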