rahideer committed on
Commit
0cf0218
Β·
verified Β·
1 Parent(s): da6d2e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -88
app.py CHANGED
@@ -1,91 +1,52 @@
1
  import streamlit as st
2
- from langdetect import detect
3
- import faiss
4
- import torch
5
- from sentence_transformers import SentenceTransformer
6
- from transformers import MBartForConditionalGeneration, MBart50Tokenizer
7
- import numpy as np
8
  import pandas as pd
 
9
  import os
10
-
11
- st.set_page_config(page_title="🌍 Multilingual RAG Translator/Answer Bot", layout="centered")
12
-
13
- @st.cache_resource
14
- def load_resources():
15
- embedder = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v1")
16
- tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
17
- model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
18
-
19
- # Load multilingual dataset CSV
20
- df = pd.read_csv("all_languages_test.csv")
21
-
22
- # Construct corpus
23
- corpus = (df["premise"] + " " + df["hypothesis"]).fillna("").tolist()
24
-
25
- # Compute embeddings for corpus
26
- corpus_embeddings = embedder.encode(corpus, convert_to_numpy=True, show_progress_bar=True)
27
-
28
- # Create FAISS index
29
- dimension = corpus_embeddings.shape[1]
30
- index = faiss.IndexFlatL2(dimension)
31
- index.add(corpus_embeddings)
32
-
33
- return embedder, index, corpus, tokenizer, model
34
-
35
- def detect_lang(text):
36
- try:
37
- return detect(text)
38
- except:
39
- return "en"
40
-
41
- def get_top_k_passages(query, embedder, index, corpus, k=3):
42
- query_embedding = embedder.encode([query], convert_to_numpy=True)
43
- distances, indices = index.search(query_embedding, k)
44
- return [corpus[i] for i in indices[0] if i < len(corpus)]
45
-
46
- def generate_answer(query, context, tokenizer, model, src_lang):
47
- model.eval()
48
- tokenizer.src_lang = src_lang
49
- joined_context = " ".join(context)
50
-
51
- inputs = tokenizer(query + " " + joined_context, return_tensors="pt", max_length=1024, truncation=True)
52
- generated_tokens = model.generate(
53
- **inputs,
54
- forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"],
55
- max_length=256,
56
- num_beams=5,
57
- early_stopping=True
58
- )
59
- return tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
60
-
61
- st.title("🌍 Multilingual RAG Translator/Answer Bot")
62
- st.caption("Ask in Urdu, French, Hindi, etc., and get culturally-aware, context-grounded answers.")
63
-
64
- query = st.text_input("πŸ’¬ Enter your question in any supported language:")
65
-
66
- if query:
67
- if len(query.strip()) < 3:
68
- st.warning("Please enter a more complete question for better results.")
69
- else:
70
- with st.spinner("Thinking..."):
71
- embedder, index, corpus, tokenizer, model = load_resources()
72
- lang = detect_lang(query)
73
-
74
- lang_map = {
75
- "en": "en_XX", "fr": "fr_XX", "ur": "ur_PK", "hi": "hi_IN",
76
- "es": "es_XX", "de": "de_DE", "zh": "zh_CN", "ar": "ar_AR",
77
- "ru": "ru_RU", "tr": "tr_TR", "it": "it_IT", "pt": "pt_XX",
78
- }
79
-
80
- src_lang = lang_map.get(lang, "en_XX")
81
- context = get_top_k_passages(query, embedder, index, corpus)
82
-
83
- if not context:
84
- st.error("⚠️ Could not find any relevant context to answer your question.")
85
- else:
86
- try:
87
- answer = generate_answer(query, context, tokenizer, model, src_lang)
88
- st.markdown("### πŸ“Œ Answer:")
89
- st.success(answer)
90
- except Exception as e:
91
- st.error(f"⚠️ Something went wrong while generating the answer.\n\n{e}")
 
1
  import streamlit as st
 
 
 
 
 
 
2
  import pandas as pd
3
+ import zipfile
4
  import os
5
+ from sentence_transformers import SentenceTransformer, util
6
+ from transformers import pipeline
7
+
8
+ # Constants
9
+ ZIP_FILE = "xnli-multilingual-nli-dataset.zip"
10
+ CSV_FILE = "en_test.csv"
11
+ EXTRACT_FOLDER = "extracted_data"
12
+
13
+ # Load and extract ZIP
14
+ @st.cache_data
15
+ def extract_and_load():
16
+ if not os.path.exists(EXTRACT_FOLDER):
17
+ with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
18
+ zip_ref.extractall(EXTRACT_FOLDER)
19
+ csv_path = os.path.join(EXTRACT_FOLDER, CSV_FILE)
20
+ df = pd.read_csv(csv_path).dropna().sample(500)
21
+ return df[['premise', 'hypothesis', 'label']]
22
+
23
+ df = extract_and_load()
24
+
25
+ # Load models
26
+ nli_model = pipeline("text-classification", model="joeddav/xlm-roberta-large-xnli")
27
+ embedder = SentenceTransformer("sentence-transformers/distiluse-base-multilingual-cased-v2")
28
+
29
+ # UI
30
+ st.title("🌐 Multilingual RAG-style NLI Explorer")
31
+ st.markdown("Enter a sentence in **any language**, and the app will find a related statement from the dataset and infer their relationship.")
32
+
33
+ user_input = st.text_input("Enter your **hypothesis** (your own sentence):")
34
+
35
+ if user_input:
36
+ with st.spinner("Finding most relevant premise..."):
37
+ premise_embeddings = embedder.encode(df['premise'].tolist(), convert_to_tensor=True)
38
+ user_embedding = embedder.encode(user_input, convert_to_tensor=True)
39
+
40
+ top_hit = util.semantic_search(user_embedding, premise_embeddings, top_k=1)[0][0]
41
+ match_idx = top_hit['corpus_id']
42
+ selected_premise = df.iloc[match_idx]['premise']
43
+
44
+ st.subheader("πŸ” Most Relevant Premise:")
45
+ st.write(selected_premise)
46
+
47
+ # Run NLI classification
48
+ full_input = f"{selected_premise} </s> {user_input}"
49
+ result = nli_model(full_input)[0]
50
+
51
+ st.subheader("🧠 Predicted Relationship:")
52
+ st.write(f"**{result['label']}** (confidence: {result['score']:.2f})")