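"""Vision 2030 Virtual Assistant.

A Streamlit RAG application: users upload Vision 2030 PDF documents, the text is
chunked and indexed in a FAISS vector store using multilingual sentence-transformer
embeddings, and the ALLaM-7B-Instruct model answers questions in Arabic or English
grounded in the retrieved chunks.
"""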
import streamlit as st
import os
import re
import torch
import PyPDF2
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

# Set page configuration
st.set_page_config(
    page_title="Vision 2030 Virtual Assistant",
    page_icon="🇸🇦",
    layout="wide"
)

# App title and description
st.title("Vision 2030 Virtual Assistant")
st.markdown("Ask questions about Saudi Vision 2030 goals, projects, and progress in Arabic or English.")

# Function definitions
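# st.cache_resource keeps the loaded model and tokenizer in memory across
# Streamlit reruns, so the 7B model is only loaded once per server process.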
@st.cache_resource
def load_model_and_tokenizer():
    """Load the ALLaM-7B model and tokenizer with error handling"""
    model_name = "ALLaM-AI/ALLaM-7B-Instruct-preview"
    st.info(f"Loading model: {model_name} (this may take a few minutes)")
    
    try:
        # First attempt with AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            use_fast=False
        )
        
        # Load model with appropriate settings for ALLaM
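        # device_map="auto" lets accelerate place layers across available GPUs/CPU,
        # and bfloat16 roughly halves memory use compared to float32 weights.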
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
        )
        
        st.success("Model loaded successfully!")
        
    except Exception as e:
        st.error(f"First loading attempt failed: {e}")
        st.info("Trying alternative loading approach...")
        
        # Try with specific tokenizer class if the first attempt fails
        from transformers import LlamaTokenizer
        
        tokenizer = LlamaTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            trust_remote_code=True,
            device_map="auto",
        )
        
        st.success("Model loaded successfully with LlamaTokenizer!")
    
    return model, tokenizer

def detect_language(text):
    """Detect if text is primarily Arabic or English"""
    arabic_chars = re.findall(r'[\u0600-\u06FF]', text)
    is_arabic = len(arabic_chars) > len(text) * 0.5
    return "arabic" if is_arabic else "english"

def process_pdfs():
    """Process uploaded PDF documents"""
    documents = []
    
    if 'uploaded_pdfs' in st.session_state and st.session_state.uploaded_pdfs:
        for pdf_file in st.session_state.uploaded_pdfs:
            try:
                # Save the uploaded file temporarily
                pdf_path = f"temp_{pdf_file.name}"
                with open(pdf_path, "wb") as f:
                    f.write(pdf_file.getbuffer())
                
                # Extract text; extract_text() can return None for image-only pages
                text = ""
                with open(pdf_path, 'rb') as file:
                    reader = PyPDF2.PdfReader(file)
                    for page in reader.pages:
                        page_text = page.extract_text() or ""
                        text += page_text + "\n\n"
                
                # Remove temporary file
                os.remove(pdf_path)
                
                if text.strip():  # If we got some text
                    doc = Document(
                        page_content=text,
                        metadata={"source": pdf_file.name, "filename": pdf_file.name}
                    )
                    documents.append(doc)
                    st.info(f"Successfully processed: {pdf_file.name}")
                else:
                    st.warning(f"No text extracted from {pdf_file.name}")
            except Exception as e:
                st.error(f"Error processing {pdf_file.name}: {e}")
    
    st.success(f"Processed {len(documents)} PDF documents")
    return documents

def create_vector_store(documents):
    """Split documents into chunks and create a FAISS vector store"""
    # Text splitter for breaking documents into chunks
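    # 500-character chunks with 50 characters of overlap keep each chunk small
    # enough for precise retrieval while preserving continuity across boundaries.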
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )
    
    # Split documents into chunks
    chunks = []
    for doc in documents:
        doc_chunks = text_splitter.split_text(doc.page_content)
        # Preserve metadata for each chunk
        chunks.extend([
            Document(page_content=chunk, metadata=doc.metadata)
            for chunk in doc_chunks
        ])
    
    st.info(f"Created {len(chunks)} chunks from {len(documents)} documents")
    
    # Create a proper embedding function for LangChain
    embedding_function = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    )
    
    # Create FAISS index
    vector_store = FAISS.from_documents(
        chunks,
        embedding_function
    )
    
    return vector_store

def retrieve_context(query, vector_store, top_k=5):
    """Retrieve most relevant document chunks for a given query"""
    # Search the vector store using similarity search
    results = vector_store.similarity_search_with_score(query, k=top_k)
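    # With the default index built by FAISS.from_documents, the score returned
    # here is an L2 distance, so smaller values indicate closer matches.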
    
    # Format the retrieved contexts
    contexts = []
    for doc, score in results:
        contexts.append({
            "content": doc.page_content,
            "source": doc.metadata.get("source", "Unknown"),
            "relevance_score": score
        })
    
    return contexts

def generate_response(query, contexts, model, tokenizer):
    """Generate a response using retrieved contexts with ALLaM-specific formatting"""
    # Auto-detect language
    language = detect_language(query)
    
    # Format the prompt based on language
    if language == "arabic":
        instruction = (
            "أنت مساعد افتراضي يهتم برؤية السعودية 2030. استخدم المعلومات التالية للإجابة على السؤال. "
            "إذا لم تعرف الإجابة، فقل بأمانة إنك لا تعرف."
        )
    else:  # english
        instruction = (
            "You are a virtual assistant for Saudi Vision 2030. Use the following information to answer the question. "
            "If you don't know the answer, honestly say you don't know."
        )
    
    # Combine retrieved contexts
    context_text = "\n\n".join([f"Document: {ctx['content']}" for ctx in contexts])
    
    # Format the prompt for ALLaM instruction format
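    # Llama-style [INST] prompt formatting; if the tokenizer ships a chat template,
    # tokenizer.apply_chat_template would be a more robust way to build this prompt.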
    prompt = f"""<s>[INST] {instruction}

Context:
{context_text}

Question: {query} [/INST]</s>"""
    
    try:
        with st.spinner("Generating response..."):
            # Generate response with appropriate parameters for ALLaM
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            
            # Generate with appropriate parameters
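            # Nucleus sampling (top_p=0.9) with moderate temperature keeps answers
            # varied but on-topic; repetition_penalty=1.1 discourages repeated phrases.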
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.1
            )
            
            # Decode the response
            full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
            
            # Extract just the answer part (after the instruction)
            response = full_output.split("[/INST]")[-1].strip()
            
            # If response is empty for some reason, return the full output
            if not response:
                response = full_output
                
            return response, [ctx.get("source", "Unknown") for ctx in contexts]
        
    except Exception as e:
        st.error(f"Error during generation: {e}")
        # Fallback response
        return "I apologize, but I encountered an error while generating a response.", []

# Initialize the app state
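# st.session_state persists across Streamlit reruns, so the chat history,
# vector store, and uploaded files survive each UI interaction.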
if 'conversation_history' not in st.session_state:
    st.session_state.conversation_history = []

if 'vector_store' not in st.session_state:
    st.session_state.vector_store = None

if 'uploaded_pdfs' not in st.session_state:
    st.session_state.uploaded_pdfs = None

# PDF upload section
st.header("1. Upload Vision 2030 Documents")
uploaded_files = st.file_uploader("Upload PDF documents about Vision 2030", 
                                 type=["pdf"], 
                                 accept_multiple_files=True,
                                 help="Upload one or more PDF documents containing information about Vision 2030")

if uploaded_files:
    st.session_state.uploaded_pdfs = uploaded_files
    if st.button("Process PDFs"):
        documents = process_pdfs()
        if documents:
            with st.spinner("Creating vector database..."):
                st.session_state.vector_store = create_vector_store(documents)
            st.success("Vector database created successfully!")

# Load the model (cached)
model, tokenizer = load_model_and_tokenizer()

# Chat interface
st.header("2. Chat with the Vision 2030 Assistant")

# Display conversation history
for message in st.session_state.conversation_history:
    if message["role"] == "user":
        st.markdown(f"**You:** {message['content']}")
    else:
        st.markdown(f"**Assistant:** {message['content']}")
        if 'sources' in message and message['sources']:
            st.markdown(f"*Sources: {', '.join([os.path.basename(src) for src in message['sources']])}*")
    st.divider()

# Input for new question
user_input = st.text_input("Ask a question about Vision 2030 (in Arabic or English):", key="user_query")

# Examples
st.markdown("**Example questions:**")
examples_col1, examples_col2 = st.columns(2)
with examples_col1:
    st.markdown("- What is Saudi Vision 2030?")
    st.markdown("- What are the economic goals of Vision 2030?")
    st.markdown("- How does Vision 2030 support women's empowerment?")
with examples_col2:
    st.markdown("- ما هي رؤية السعودية 2030؟")
    st.markdown("- ما هي الأهداف الاقتصادية لرؤية 2030؟")
    st.markdown("- كيف تدعم رؤية 2030 تمكين المرأة السعودية؟")

# Process the user input; skip queries that were already answered so the
# rerun below does not re-trigger generation for the same question
if user_input and st.session_state.vector_store and user_input != st.session_state.get("last_query"):
    st.session_state.last_query = user_input

    # Add user message to history
    st.session_state.conversation_history.append({"role": "user", "content": user_input})

    # Retrieve relevant chunks and generate a grounded response
    contexts = retrieve_context(user_input, st.session_state.vector_store)
    response, sources = generate_response(user_input, contexts, model, tokenizer)

    # Add assistant message to history
    st.session_state.conversation_history.append({"role": "assistant", "content": response, "sources": sources})

    # Rerun to update the UI with the new messages
    st.experimental_rerun()

elif user_input and not st.session_state.vector_store:
    st.warning("Please upload and process Vision 2030 PDF documents first.")

# Reset conversation button
if st.button("Reset Conversation") and len(st.session_state.conversation_history) > 0:
    st.session_state.conversation_history = []
    st.experimental_rerun()