# Vision 2030 Virtual Assistant with Arabic (ALLaM-7B) and English (Mistral-7B-Instruct) + RAG + Evaluation Framework
"""
Enhanced implementation of the Vision 2030 Virtual Assistant that meets all project requirements:
1. Implements proper NLP task structure (bilingual QA system)
2. Adds comprehensive evaluation framework for quantitative and qualitative assessment
3. Improves RAG implementation with better retrieval and document processing
4. Adds user feedback collection for continuous improvement
5. Includes structured logging and performance monitoring
"""

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langdetect import detect
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
import time
import logging
import os
import re
from datetime import datetime
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import pandas as pd
import matplotlib.pyplot as plt
import PyPDF2
import io

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("vision2030_assistant.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger('vision2030_assistant')

class Vision2030Assistant:
    def __init__(self, pdf_path="vision2030.pdf", eval_data_path="evaluation_data.json"):
        """
        Initialize the Vision 2030 Assistant with models, knowledge base, and evaluation framework
        
        Args:
            pdf_path: Path to the Vision 2030 PDF document
            eval_data_path: Path to evaluation dataset
        """
        logger.info("Initializing Vision 2030 Assistant...")
        self.load_models()
        self.load_and_process_documents(pdf_path)
        self.setup_evaluation_framework(eval_data_path)
        self.response_history = []
        logger.info("Vision 2030 Assistant initialized successfully")
    
    def load_models(self):
        """Load language models and embedding models for both Arabic and English"""
        logger.info("Loading language and embedding models...")
        
        # Load Arabic Model (ALLaM-7B)
        try:
            self.arabic_model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
            self.arabic_tokenizer = AutoTokenizer.from_pretrained(self.arabic_model_id)
            self.arabic_model = AutoModelForCausalLM.from_pretrained(self.arabic_model_id, device_map="auto")
            self.arabic_pipe = pipeline("text-generation", model=self.arabic_model, tokenizer=self.arabic_tokenizer)
            logger.info("Arabic model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading Arabic model: {str(e)}")
            raise
        
        # Load English Model (Mistral-7B-Instruct)
        try:
            self.english_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
            self.english_tokenizer = AutoTokenizer.from_pretrained(self.english_model_id)
            self.english_model = AutoModelForCausalLM.from_pretrained(self.english_model_id, device_map="auto")
            self.english_pipe = pipeline("text-generation", model=self.english_model, tokenizer=self.english_tokenizer)
            logger.info("English model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading English model: {str(e)}")
            raise
        
        # Load Embedding Models for Retrieval
        try:
            self.arabic_embedder = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-ca')
            self.english_embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
            logger.info("Embedding models loaded successfully")
        except Exception as e:
            logger.error(f"Error loading embedding models: {str(e)}")
            raise

    def load_and_process_documents(self, pdf_path):
        """Load and process the Vision 2030 document from PDF"""
        logger.info(f"Processing Vision 2030 document from {pdf_path}")
        
        # Initialize empty document lists
        self.english_texts = []
        self.arabic_texts = []
        
        try:
            # Check if PDF exists
            if os.path.exists(pdf_path):
                # Extract text from PDF
                with open(pdf_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    full_text = ""
                    for page_num in range(len(reader.pages)):
                        page = reader.pages[page_num]
                        full_text += page.extract_text() + "\n"
                
                # Split into chunks (simple approach - could be improved with better text segmentation)
                chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
                
                # Detect language and add to appropriate list
                for chunk in chunks:
                    try:
                        lang = detect(chunk)
                        if lang == "ar":
                            self.arabic_texts.append(chunk)
                        else:  # Default to English for other languages
                            self.english_texts.append(chunk)
                    except:
                        # If language detection fails, assume English
                        self.english_texts.append(chunk)
                
                logger.info(f"Processed {len(self.arabic_texts)} Arabic and {len(self.english_texts)} English chunks")
            else:
                logger.warning(f"PDF file not found at {pdf_path}. Using fallback sample data.")
                self._create_sample_data()
        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            logger.info("Using fallback sample data")
            self._create_sample_data()
        
        # Create FAISS indices
        self._create_indices()

    def _create_sample_data(self):
        """Create sample Vision 2030 data if PDF processing fails"""
        logger.info("Creating sample Vision 2030 data")
        
        # English sample texts
        self.english_texts = [
            "Vision 2030 is Saudi Arabia's strategic framework to reduce dependence on oil, diversify the economy, and develop public sectors.",
            "The key pillars of Vision 2030 are a vibrant society, a thriving economy, and an ambitious nation.",
            "The Saudi Public Investment Fund (PIF) plays a crucial role in Vision 2030 by investing in strategic sectors.",
            "NEOM is a planned cross-border smart city in the Tabuk Province of northwestern Saudi Arabia, a key project of Vision 2030.",
            "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%.",
            "The Red Sea Project is a Vision 2030 initiative to develop luxury tourism destinations across 50 islands off Saudi Arabia's Red Sea coast.",
            "Qiddiya is a entertainment mega-project being built in Riyadh as part of Vision 2030.",
            "Vision 2030 targets increasing the private sector's contribution to GDP from 40% to 65%.",
            "One goal of Vision 2030 is to increase foreign direct investment from 3.8% to 5.7% of GDP.",
            "Vision 2030 includes plans to develop the digital infrastructure and support for tech startups in Saudi Arabia."
        ]
        
        # Arabic sample texts (same content as English)
        self.arabic_texts = [
            "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة.",
            "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح.",
            "يلعب صندوق الاستثمارات العامة السعودي دورًا محوريًا في رؤية 2030 من خلال الاستثمار في القطاعات الاستراتيجية.",
            "نيوم هي مدينة ذكية مخططة عبر الحدود في مقاطعة تبوك شمال غرب المملكة العربية السعودية، وهي مشروع رئيسي من رؤية 2030.",
            "تهدف رؤية 2030 إلى زيادة مشاركة المرأة في القوى العاملة من 22٪ إلى 30٪.",
            "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي.",
            "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030.",
            "تستهدف رؤية 2030 زيادة مساهمة القطاع الخاص في الناتج المحلي الإجمالي من 40٪ إلى 65٪.",
            "أحد أهداف رؤية 2030 هو زيادة الاستثمار الأجنبي المباشر من 3.8٪ إلى 5.7٪ من الناتج المحلي الإجمالي.",
            "تتضمن رؤية 2030 خططًا لتطوير البنية التحتية الرقمية والدعم للشركات الناشئة التكنولوجية في المملكة العربية السعودية."
        ]

    def _create_indices(self):
        """Create FAISS indices for fast text retrieval"""
        logger.info("Creating FAISS indices for text retrieval")
        
        try:
            # Process and embed English texts
            self.english_vectors = []
            for text in self.english_texts:
                vec = self.english_embedder.encode(text)
                self.english_vectors.append(vec)
            
            # Create English index
            if self.english_vectors:
                self.english_index = faiss.IndexFlatL2(len(self.english_vectors[0]))
                self.english_index.add(np.array(self.english_vectors))
                logger.info(f"Created English index with {len(self.english_vectors)} vectors")
            else:
                logger.warning("No English texts to index")
            
            # Process and embed Arabic texts
            self.arabic_vectors = []
            for text in self.arabic_texts:
                vec = self.arabic_embedder.encode(text)
                self.arabic_vectors.append(vec)
            
            # Create Arabic index
            if self.arabic_vectors:
                self.arabic_index = faiss.IndexFlatL2(len(self.arabic_vectors[0]))
                self.arabic_index.add(np.array(self.arabic_vectors))
                logger.info(f"Created Arabic index with {len(self.arabic_vectors)} vectors")
            else:
                logger.warning("No Arabic texts to index")
                
        except Exception as e:
            logger.error(f"Error creating FAISS indices: {str(e)}")
            raise

    def setup_evaluation_framework(self, eval_data_path):
        """Set up the evaluation framework with test data and metrics"""
        logger.info("Setting up evaluation framework")
        
        # Initialize metrics trackers
        self.metrics = {
            "response_times": [],
            "user_ratings": [],
            "retrieval_precision": [],
            "factual_accuracy": []
        }
        
        # Load evaluation data if exists, otherwise create sample
        try:
            if os.path.exists(eval_data_path):
                with open(eval_data_path, 'r', encoding='utf-8') as f:
                    self.eval_data = json.load(f)
                logger.info(f"Loaded {len(self.eval_data)} evaluation examples from {eval_data_path}")
            else:
                logger.warning(f"Evaluation data not found at {eval_data_path}. Creating sample evaluation data.")
                self._create_sample_eval_data()
        except Exception as e:
            logger.error(f"Error loading evaluation data: {str(e)}")
            self._create_sample_eval_data()
    
    def _create_sample_eval_data(self):
        """Create sample evaluation data with ground truth"""
        self.eval_data = [
            {
                "question": "What are the key pillars of Vision 2030?",
                "lang": "en",
                "reference_answer": "The key pillars of Vision 2030 are a vibrant society, a thriving economy, and an ambitious nation."
            },
            {
                "question": "ما هي الركائز الرئيسية لرؤية 2030؟",
                "lang": "ar",
                "reference_answer": "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."
            },
            {
                "question": "What is NEOM?",
                "lang": "en",
                "reference_answer": "NEOM is a planned cross-border smart city in the Tabuk Province of northwestern Saudi Arabia, a key project of Vision 2030."
            },
            {
                "question": "ما هو مشروع البحر الأحمر؟",
                "lang": "ar",
                "reference_answer": "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي."
            }
        ]
        logger.info(f"Created {len(self.eval_data)} sample evaluation examples")

    def retrieve_context(self, query, lang):
        """Retrieve relevant context for a query based on language"""
        start_time = time.time()
        
        try:
            if lang == "ar":
                query_vec = self.arabic_embedder.encode(query)
                D, I = self.arabic_index.search(np.array([query_vec]), k=2)  # Get top 2 most relevant chunks
                context = "\n".join([self.arabic_texts[i] for i in I[0] if i < len(self.arabic_texts) and i >= 0])
            else:
                query_vec = self.english_embedder.encode(query)
                D, I = self.english_index.search(np.array([query_vec]), k=2)  # Get top 2 most relevant chunks
                context = "\n".join([self.english_texts[i] for i in I[0] if i < len(self.english_texts) and i >= 0])
            
            retrieval_time = time.time() - start_time
            logger.info(f"Retrieved context in {retrieval_time:.2f}s")
            
            return context
        except Exception as e:
            logger.error(f"Error retrieving context: {str(e)}")
            return ""

    def generate_response(self, user_input):
        """Generate a response to user input using the appropriate model and retrieval system"""
        start_time = time.time()
        
        # Default response in case of failure
        default_response = {
            "en": "I apologize, but I couldn't process your request properly. Please try again.",
            "ar": "أعتذر، لم أتمكن من معالجة طلبك بشكل صحيح. الرجاء المحاولة مرة أخرى."
        }
        
        try:
            # Detect language
            try:
                lang = detect(user_input)
                if lang != "ar":  # Simplify to just Arabic vs non-Arabic
                    lang = "en"
            except:
                lang = "en"  # Default fallback
            
            logger.info(f"Detected language: {lang}")
            
            # Retrieve relevant context
            context = self.retrieve_context(user_input, lang)
            
            if lang == "ar":
                # Improved Arabic Prompt
                input_text = (
                    f"أنت خبير في رؤية السعودية 2030.\n"
                    f"إليك بعض المعلومات المهمة:\n{context}\n\n"
                    f"مثال:\n"
                    f"السؤال: ما هي ركائز رؤية 2030؟\n"
                    f"الإجابة: ركائز رؤية 2030 هي مجتمع حيوي، اقتصاد مزدهر، ووطن طموح.\n\n"
                    f"أجب عن سؤال المستخدم بشكل واضح ودقيق، مستندًا إلى المعلومات المقدمة. إذا لم تكن المعلومات متوفرة، أوضح ذلك.\n"
                    f"السؤال: {user_input}\n"
                    f"الإجابة:"
                )
                
                response = self.arabic_pipe(input_text, max_new_tokens=256, do_sample=True, temperature=0.7)
                full_text = response[0]['generated_text']
                
                # Extract the answer part
                answer_pattern = r"الإجابة:(.*?)(?:$)"
                match = re.search(answer_pattern, full_text, re.DOTALL)
                if match:
                    reply = match.group(1).strip()
                else:
                    reply = full_text
            else:
                # Improved English Prompt
                input_text = (
                    f"You are an expert on Saudi Arabia's Vision 2030.\n"
                    f"Here is some relevant information:\n{context}\n\n"
                    f"Example:\n"
                    f"Question: What are the key pillars of Vision 2030?\n"
                    f"Answer: The key pillars are a vibrant society, a thriving economy, and an ambitious nation.\n\n"
                    f"Answer the user's question clearly and accurately based on the provided information. If information is not available, make that clear.\n"
                    f"Question: {user_input}\n"
                    f"Answer:"
                )
                
                response = self.english_pipe(input_text, max_new_tokens=256, do_sample=True, temperature=0.7)
                full_text = response[0]['generated_text']
                
                # Extract the answer part
                answer_pattern = r"Answer:(.*?)(?:$)"
                match = re.search(answer_pattern, full_text, re.DOTALL)
                if match:
                    reply = match.group(1).strip()
                else:
                    reply = full_text
        
        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            reply = default_response.get(lang, default_response["en"])
        
        # Record response time
        response_time = time.time() - start_time
        self.metrics["response_times"].append(response_time)
        
        logger.info(f"Generated response in {response_time:.2f}s")
        
        # Store the interaction for later evaluation
        interaction = {
            "timestamp": datetime.now().isoformat(),
            "user_input": user_input,
            "response": reply,
            "language": lang,
            "response_time": response_time
        }
        self.response_history.append(interaction)
        
        return reply

    def evaluate_factual_accuracy(self, response, reference):
        """Simple evaluation of factual accuracy by keyword matching"""
        # This is a simplified approach - in production, use more sophisticated methods
        keywords_reference = set(re.findall(r'\b\w+\b', reference.lower()))
        keywords_response = set(re.findall(r'\b\w+\b', response.lower()))
        
        common_keywords = keywords_reference.intersection(keywords_response)
        
        if len(keywords_reference) > 0:
            accuracy = len(common_keywords) / len(keywords_reference)
        else:
            accuracy = 0
            
        return accuracy

    def evaluate_on_test_set(self):
        """Evaluate the assistant on the test set"""
        logger.info("Running evaluation on test set")
        
        eval_results = []
        
        for example in self.eval_data:
            # Generate response
            response = self.generate_response(example["question"])
            
            # Calculate factual accuracy
            accuracy = self.evaluate_factual_accuracy(response, example["reference_answer"])
            
            eval_results.append({
                "question": example["question"],
                "reference": example["reference_answer"],
                "response": response,
                "factual_accuracy": accuracy
            })
            
            self.metrics["factual_accuracy"].append(accuracy)
        
        # Calculate average factual accuracy
        avg_accuracy = sum(self.metrics["factual_accuracy"]) / len(self.metrics["factual_accuracy"]) if self.metrics["factual_accuracy"] else 0
        avg_response_time = sum(self.metrics["response_times"]) / len(self.metrics["response_times"]) if self.metrics["response_times"] else 0
        
        results = {
            "average_factual_accuracy": avg_accuracy,
            "average_response_time": avg_response_time,
            "detailed_results": eval_results
        }
        
        logger.info(f"Evaluation results: Factual accuracy = {avg_accuracy:.2f}, Avg response time = {avg_response_time:.2f}s")
        
        return results

    def record_user_feedback(self, user_input, response, rating, feedback_text=""):
        """Record user feedback for a response"""
        feedback = {
            "timestamp": datetime.now().isoformat(),
            "user_input": user_input,
            "response": response,
            "rating": rating,
            "feedback_text": feedback_text
        }
        
        self.metrics["user_ratings"].append(rating)
        
        # In a production system, store this in a database
        logger.info(f"Recorded user feedback: rating={rating}")
        
        return True

    def save_evaluation_metrics(self, output_path="evaluation_metrics.json"):
        """Save evaluation metrics to a file"""
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "response_times": self.metrics["response_times"],
                    "user_ratings": self.metrics["user_ratings"],
                    "factual_accuracy": self.metrics["factual_accuracy"],
                    "average_factual_accuracy": sum(self.metrics["factual_accuracy"]) / len(self.metrics["factual_accuracy"]) if self.metrics["factual_accuracy"] else 0,
                    "average_response_time": sum(self.metrics["response_times"]) / len(self.metrics["response_times"]) if self.metrics["response_times"] else 0,
                    "average_user_rating": sum(self.metrics["user_ratings"]) / len(self.metrics["user_ratings"]) if self.metrics["user_ratings"] else 0,
                    "timestamp": datetime.now().isoformat()
                }, f, indent=2)
            
            logger.info(f"Saved evaluation metrics to {output_path}")
            return True
        except Exception as e:
            logger.error(f"Error saving evaluation metrics: {str(e)}")
            return False

# --- Gradio UI --- #
def create_gradio_interface():
    # Initialize the assistant
    assistant = Vision2030Assistant()
    
    # Track conversation history
    conversation_history = []
    
    def chat(message, history):
        if not message:
            return history, ""
        
        # Generate response
        reply = assistant.generate_response(message)
        
        # Update history
        history.append((message, reply))
        
        return history, ""
    
    def provide_feedback(message, rating, feedback_text):
        # Find the most recent interaction
        if conversation_history:
            last_interaction = conversation_history[-1]
            assistant.record_user_feedback(last_interaction[0], last_interaction[1], rating, feedback_text)
            return f"Thank you for your feedback! (Rating: {rating}/5)"
        return "No conversation found to rate."
    
    def clear_history():
        conversation_history.clear()
        return []
    
    def download_metrics():
        assistant.save_evaluation_metrics()
        return "evaluation_metrics.json"
    
    def run_evaluation():
        results = assistant.evaluate_on_test_set()
        return f"Evaluation Results:\nFactual Accuracy: {results['average_factual_accuracy']:.2f}\nAverage Response Time: {results['average_response_time']:.2f}s"
    
    # Create Gradio interface
    with gr.Blocks() as demo:
        gr.Markdown("# Vision 2030 Virtual Assistant 🌍\n\nAsk questions about Saudi Vision 2030 in Arabic or English")
        
        with gr.Tab("Chat"):
            chatbot = gr.Chatbot(show_label=False)
            msg = gr.Textbox(label="Ask me anything about Vision 2030", placeholder="Type your question here...")
            clear = gr.Button("Clear Conversation")
            
            with gr.Row():
                with gr.Column(scale=4):
                    feedback_text = gr.Textbox(label="Provide additional feedback (optional)")
                with gr.Column(scale=1):
                    rating = gr.Slider(label="Rate Response (1-5)", minimum=1, maximum=5, step=1, value=3)
            
            submit_feedback = gr.Button("Submit Feedback")
            feedback_result = gr.Textbox(label="Feedback Status")
            
            # Set up event handlers
            msg.submit(chat, [msg, chatbot], [chatbot, msg])
            clear.click(clear_history, None, chatbot)
            submit_feedback.click(provide_feedback, [msg, rating, feedback_text], feedback_result)
        
        with gr.Tab("Evaluation"):
            eval_button = gr.Button("Run Evaluation on Test Set")
            eval_results = gr.Textbox(label="Evaluation Results")
            download_button = gr.Button("Download Metrics")
            download_file = gr.File(label="Download evaluation metrics as JSON")
            
            # Set up evaluation handlers
            eval_button.click(run_evaluation, None, eval_results)
            download_button.click(download_metrics, None, download_file)
        
        with gr.Tab("About"):
            gr.Markdown("""
            ## About Vision 2030 Virtual Assistant
            
            This assistant uses a combination of state-of-the-art language models to answer questions about Saudi Arabia's Vision 2030 strategic framework in both Arabic and English.
            
            ### Features:
            - Bilingual support (Arabic and English)
            - Retrieval-Augmented Generation (RAG) for factual accuracy
            - Evaluation framework for measuring performance
            - User feedback collection for continuous improvement
            
            ### Models Used:
            - Arabic: ALLaM-7B-Instruct-preview
            - English: Mistral-7B-Instruct-v0.2
            - Embeddings: CAMeL-Lab/bert-base-arabic-camelbert-ca and sentence-transformers/all-MiniLM-L6-v2
            
            This project demonstrates the application of advanced NLP techniques for multilingual question answering, particularly for Arabic language support.
            """)
    
    return demo

# Launch the application
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()