Gopikanth123 committed on
Commit
34cc5b3
·
verified ·
1 Parent(s): 3e21064

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +77 -211
main.py CHANGED
@@ -1,211 +1,77 @@
1
- import os
2
- import shutil
3
- from flask import Flask, render_template, request, jsonify
4
- from whoosh.index import create_in, open_dir
5
- from whoosh.fields import Schema, TEXT
6
- from whoosh.qparser import QueryParser
7
- from transformers import AutoTokenizer, AutoModel
8
- from deep_translator import GoogleTranslator
9
-
10
# Ensure the necessary directories exist
PERSIST_DIR = "db"
PDF_DIRECTORY = 'data'
os.makedirs(PDF_DIRECTORY, exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)

# Load the XLM-R tokenizer and model.
# BUG FIX: the original fused the `model = ...` assignment with an
# accidentally pasted chat-assistant response (prose plus a ```python
# markdown fence), which is a SyntaxError. The assignment is restored here.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")
23
- import os
24
- import shutil
25
- import torch
26
- from flask import Flask, render_template, request, jsonify
27
- from whoosh.index import create_in, open_dir
28
- from whoosh.fields import Schema, TEXT
29
- from whoosh.qparser import QueryParser
30
- from transformers import AutoTokenizer, AutoModel
31
- from deep_translator import GoogleTranslator
32
-
33
# Working directories for raw documents and the Whoosh index.
PERSIST_DIR = "db"
PDF_DIRECTORY = 'data'
os.makedirs(PDF_DIRECTORY, exist_ok=True)
os.makedirs(PERSIST_DIR, exist_ok=True)

# XLM-R tokenizer/model, used by get_embeddings() for optional embeddings.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModel.from_pretrained("xlm-roberta-base")

# Whoosh schema: both fields are stored so search hits can return them.
schema = Schema(title=TEXT(stored=True), content=TEXT(stored=True))
45
-
46
# Create the Whoosh index storage on first run, or reopen an existing one.
def create_index():
    """Return a Whoosh index handle, creating the storage dir if needed."""
    if os.path.exists(PERSIST_DIR):
        return open_dir(PERSIST_DIR)
    os.makedirs(PERSIST_DIR)
    return create_in(PERSIST_DIR, schema)

index = create_index()
55
-
56
# Function to load documents from a directory
def load_documents(directory=None):
    """Load every .txt file from *directory*.

    Args:
        directory: folder to scan; defaults to the module-level
            PDF_DIRECTORY when None, so existing call sites are unchanged.

    Returns:
        list[dict]: one {'title': filename, 'content': text} entry per file.
    """
    if directory is None:
        directory = PDF_DIRECTORY
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):  # Assuming documents are in .txt format
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
            documents.append({'title': filename, 'content': content})
            # BUG FIX: the original f-string printed the literal text
            # "(unknown)"; report the file that was actually loaded.
            print(f"Loaded document: {filename}")  # Debugging line
    return documents
66
-
67
# Write a batch of documents into the global Whoosh index.
def index_documents(documents):
    """Add each {'title', 'content'} dict in *documents* to the index."""
    writer = index.writer()
    for entry in documents:
        writer.add_document(title=entry['title'], content=entry['content'])
    writer.commit()
73
-
74
# Rebuild the Whoosh index from scratch using the files currently on disk.
def data_ingestion_from_directory():
    """Wipe the persisted index, reload documents, and re-index them."""
    global index

    # Drop any previously persisted index data before rebuilding.
    if os.path.exists(PERSIST_DIR):
        shutil.rmtree(PERSIST_DIR)
    os.makedirs(PERSIST_DIR, exist_ok=True)

    new_documents = load_documents()
    if not new_documents:
        print("No documents found to index.")
        return

    # Fresh index over the freshly loaded documents.
    index = create_index()
    index_documents(new_documents)
92
-
93
# Search the Whoosh index and return matching (title, content) pairs.
def retrieve_documents(query):
    """Parse *query* against the 'content' field and run the search."""
    parser = QueryParser("content", index.schema)
    parsed = parser.parse(query)
    with index.searcher() as searcher:
        hits = searcher.search(parsed)
        # Materialize results while the searcher is still open.
        return [(hit['title'], hit['content']) for hit in hits]
100
-
101
# Compute a mean-pooled XLM-R embedding for a piece of text.
# (Unused by the request path; kept as a utility.)
def get_embeddings(text):
    """Return a 1-D numpy vector: mean of the last hidden state's tokens."""
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**encoded)
        pooled = output.last_hidden_state.mean(dim=1)  # Average pooling
    return pooled.squeeze().numpy()
108
-
109
# Turn a user query into a text answer built from retrieved documents.
def handle_query(query):
    """Search the index and format the top hits into a reply string."""
    retrieved_docs = retrieve_documents(query)
    if not retrieved_docs:
        return "Sorry, I couldn't find an answer."

    # One snippet per hit: title plus the first 100 chars of content.
    snippets = [
        f"Title: {title}\nContent: {content[:100]}..."
        for title, content in retrieved_docs
    ]
    return "Here are some insights based on your query:\n" + "\n".join(snippets)
121
-
122
# Flask application instance.
app = Flask(__name__)

# Build the search index once at import/startup time.
data_ingestion_from_directory()
127
-
128
# Answer a query, then translate the reply into the requested language.
def generate_response(query, language):
    """Run handle_query and translate the result when *language* is supported.

    Returns the (possibly translated) reply string; on any failure, returns
    an error string instead of raising.
    """
    try:
        bot_response = handle_query(query)

        # Supported language names mapped to translator target codes.
        supported_languages = {
            "hindi": "hi",
            "bengali": "bn",
            "telugu": "te",
            "marathi": "mr",
            "tamil": "ta",
            "gujarati": "gu",
            "kannada": "kn",
            "malayalam": "ml",
            "punjabi": "pa",
            "odia": "or",
            "urdu": "ur",
            "assamese": "as",
            "sanskrit": "sa",
            "arabic": "ar",
            "australian": "en-AU",
            "bangla-india": "bn-IN",
            "chinese": "zh-CN",
            "dutch": "nl",
            "french": "fr",
            "filipino": "tl",
            "greek": "el",
            "indonesian": "id",
            "italian": "it",
            "japanese": "ja",
            "korean": "ko",
            "latin": "la",
            "nepali": "ne",
            "portuguese": "pt",
            "romanian": "ro",
            "russian": "ru",
            "spanish": "es",
            "swedish": "sv",
            "thai": "th",
            "ukrainian": "uk",
            "turkish": "tr"
        }

        # Default: pass the answer through untranslated.
        translated_text = bot_response
        try:
            target_lang = supported_languages.get(language)
            if target_lang is None:
                print(f"Unsupported language: {language}")
            else:
                translated_text = GoogleTranslator(source='auto', target=target_lang).translate(bot_response)
        except Exception as e:
            print(f"Translation error: {e}")
            translated_text = "Sorry, I couldn't translate the response."

        return translated_text
    except Exception as e:
        return f"Error fetching the response: {str(e)}"
190
-
191
# Homepage: serve the chat front-end template.
@app.route('/')
def index():
    """Render the chat UI."""
    return render_template('index.html')
195
-
196
# POST /chat: JSON {"message", "language"} in, JSON {"response"} out.
@app.route('/chat', methods=['POST'])
def chat():
    """Handle one chat turn; never raises — errors become JSON responses."""
    try:
        payload = request.json
        user_message = payload.get("message")
        language = payload.get("language")
        if not user_message:
            return jsonify({"response": "Please say something!"})

        return jsonify({"response": generate_response(user_message, language)})
    except Exception as e:
        return jsonify({"response": f"An error occurred: {str(e)}"})
209
-
210
# Dev-server entry point (Flask debug mode).
if __name__ == '__main__':
    app.run(debug=True)
 
1
import os
import shutil

from deep_translator import GoogleTranslator
from flask import Flask, request, jsonify
from huggingface_hub import InferenceClient
from llama_index import SimpleDirectoryReader, StorageContext, VectorStoreIndex, load_index_from_storage, ChatPromptTemplate
from transformers import AutoTokenizer, AutoModel
7
+
8
# Fail fast if the Hugging Face token is missing from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set.")

# Hugging Face model configuration
REPO_ID = "facebook/xlm-roberta-xl"  # Use xlm-roberta-xl model
tokenizer = AutoTokenizer.from_pretrained(REPO_ID)
model = AutoModel.from_pretrained(REPO_ID)

# Flask app
app = Flask(__name__)

# Storage locations: source documents and the persisted vector index.
PERSIST_DIR = "db"
PDF_DIRECTORY = "data"
for _path in (PDF_DIRECTORY, PERSIST_DIR):
    os.makedirs(_path, exist_ok=True)

# In-memory conversation log: [{"user": ..., "bot": ...}, ...]
chat_history = []
29
+
30
+ # Function to ingest documents
31
+ def data_ingestion_from_directory():
32
+ if os.path.exists(PERSIST_DIR):
33
+ os.system(f"rm -rf {PERSIST_DIR}") # Clear previous data
34
+ os.makedirs(PERSIST_DIR, exist_ok=True)
35
+
36
+ documents = SimpleDirectoryReader(PDF_DIRECTORY).load_data()
37
+ index = VectorStoreIndex.from_documents(documents)
38
+ index.storage_context.persist(persist_dir=PERSIST_DIR)
39
+
40
# Function to handle queries
def handle_query(query):
    """Answer *query* from the persisted vector index.

    Reloads the index from PERSIST_DIR on every call and queries it with a
    simple user/assistant chat prompt template.
    """
    ctx = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    vector_index = load_index_from_storage(ctx)
    engine = vector_index.as_query_engine()

    prompt = ChatPromptTemplate.from_messages([
        ("user", "User asked: {query_str}"),
        ("assistant", "Answer: {response}"),
    ])

    # NOTE(review): passing prompt_template= to query() mirrors the original;
    # confirm this kwarg against the installed llama_index query-engine API.
    answer = engine.query(query, prompt_template=prompt)
    return answer.response if hasattr(answer, 'response') else "No relevant answer found."
53
+
54
# Route for homepage
@app.route("/")
def index():
    """Plain-text landing message confirming the service is up."""
    return "Welcome to the RAG Application using xlm-roberta-xl!"
58
+
59
# POST /chat: expects JSON {"message": ...}; returns JSON {"response": ...}.
@app.route("/chat", methods=["POST"])
def chat():
    """Handle one chat turn; errors are returned as JSON, never raised."""
    try:
        user_message = request.json.get("message")
        if not user_message:
            return jsonify({"response": "Please provide a message!"})

        answer = handle_query(user_message)
        # Keep a simple in-memory transcript of the conversation.
        chat_history.append({"user": user_message, "bot": answer})
        return jsonify({"response": answer})
    except Exception as e:
        return jsonify({"response": f"An error occurred: {str(e)}"})
73
+
74
+ if __name__ == "__main__":
75
+ # Ingest data before starting the app
76
+ data_ingestion_from_directory()
77
+ app.run(debug=True)