Spaces:

vanhai123
/

ragflow-enterprise-search-app

Running

App Files Files Community

vanhai123 committed on May 1

Commit

580e6fc

verified ·

1 Parent(s): fe93241

Update app.py

Browse files

Files changed (1) hide show

app.py +146 -147

app.py CHANGED Viewed

@@ -1,147 +1,146 @@
-import gradio as gr
-import os
-import json
-import shutil
-from datetime import datetime
-from retriever import retriever, reload_retriever
-from generator import answer_query
-from langchain_community.document_loaders import (
-    PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
-)
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_huggingface import HuggingFaceEmbeddings
-from langchain_community.vectorstores import FAISS
-import html
-# Đường dẫn file CSS
-CUSTOM_CSS_PATH = "gradio_theme.css"
-# Quản lý danh sách file upload
-UPLOADED_FILES_JSON = "uploaded_files.json"
-uploaded_files = []
-def save_uploaded_files_to_json():
-    with open(UPLOADED_FILES_JSON, "w", encoding="utf-8") as f:
-        json.dump(uploaded_files, f, ensure_ascii=False, indent=2)
-def load_uploaded_files_from_json():
-    global uploaded_files
-    if os.path.exists(UPLOADED_FILES_JSON):
-        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
-            uploaded_files = json.load(f)
-    else:
-        uploaded_files = []
-def update_uploaded_files():
-    if not uploaded_files:
-        return "_Chưa có tài liệu nào được tải lên._"
-    return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(
-        f"- {f['name']} (Uploaded: {f['timestamp'][:19]})" for f in uploaded_files
-    )
-# Load khi khởi động
-load_uploaded_files_from_json()
-def process_document(file):
-    file_path = file.name
-    if os.path.exists("vectorstore"):
-        shutil.rmtree("vectorstore")
-    try:
-        if file_path.endswith(".pdf"):
-            loader = PyPDFLoader(file_path)
-        elif file_path.endswith(".csv"):
-            loader = CSVLoader(file_path)
-        elif file_path.endswith(".txt"):
-            loader = TextLoader(file_path, autodetect_encoding=True)   # <== fix lỗi txt
-        elif file_path.endswith(".docx") or file_path.endswith(".doc"):
-            loader = UnstructuredWordDocumentLoader(file_path)
-        else:
-            return "❌ Định dạng file không hỗ trợ.", update_uploaded_files()
-        documents = loader.load()
-    except Exception as e:
-        return f"❌ Lỗi khi tải tài liệu: {e}", update_uploaded_files()
-    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    docs = splitter.split_documents(documents)
-    if not docs:
-        return "⚠️ Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()
-    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
-    db = FAISS.from_documents(docs, embeddings)
-    db.save_local("vectorstore")
-    reload_retriever()
-    uploaded_files.append({"name": os.path.basename(file.name), "timestamp": datetime.now().isoformat()})
-    save_uploaded_files_to_json()
-    return f"✅ Đã xử lý {len(docs)} đoạn từ **{file.name}**", update_uploaded_files()
-def delete_file(filename):
-    global uploaded_files
-    filename = filename.strip()
-    uploaded_files = [f for f in uploaded_files if f["name"] != filename]
-    save_uploaded_files_to_json()
-    return update_uploaded_files()
-def clear_inputs():
-    return "", ""
-def query_function(question, model_choice, temperature, include_sources):
-    answer, docs = answer_query(question, model=model_choice, temperature=temperature)
-    answer = html.escape(answer)
-    if include_sources and docs:
-        unique_sources = set()
-        for doc in docs:
-            section = doc.metadata.get("section")
-            if section:
-                unique_sources.add(section.strip())
-            else:
-                filename = os.path.basename(doc.metadata.get("source", "Unknown"))
-                unique_sources.add(filename.strip())
-        if unique_sources:
-            sources_list = [f"- {src}" for src in sorted(unique_sources)]
-            sources_text = "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
-            answer += sources_text
-    return answer
-# Giao diện Gradio
-with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
-    with gr.Row():
-        with gr.Column(scale=5):
-            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")
-    with gr.Tabs():
-        with gr.TabItem("🔍 Tìm kiếm"):
-            with gr.Column(elem_classes="container-box"):
-                question = gr.Textbox(lines=3, label="Câu hỏi")
-                with gr.Row():
-                    model_choice = gr.Dropdown(["Gemini Pro", "GPT-3.5", "GPT-4", "Claude"], value="Gemini Pro", label="Mô hình")
-                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
-                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
-                with gr.Row():
-                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
-                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
-                output = gr.Markdown(elem_classes="output-box")  # Hiển thị kết quả trong khung đẹp
-            search_btn.click(query_function, inputs=[question, model_choice, temperature, include_sources], outputs=[output])
-            clear_btn.click(clear_inputs, outputs=[question, output])
-        with gr.TabItem("📚 Quản lý tài liệu"):
-            with gr.Column(elem_classes="container-box"):
-                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
-                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
-                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
-            uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
-            with gr.Column(elem_classes="container-box"):
-                delete_filename = gr.Textbox(label="Tên file muốn xóa")
-                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
-            upload_btn.click(process_document, inputs=[upload_file], outputs=[upload_status, uploaded_files_list])
-            delete_btn.click(delete_file, inputs=[delete_filename], outputs=[uploaded_files_list])
-    demo.launch(share=True)

+import gradio as gr
+import os
+import json
+import shutil
+from datetime import datetime
+from retriever import retriever, reload_retriever
+from generator import answer_query
+from langchain_community.document_loaders import (
+    PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+import html
+# Path to the CSS file (passed as css= to gr.Blocks below)
+CUSTOM_CSS_PATH = "gradio_theme.css"
+# Bookkeeping for the list of uploaded files
+UPLOADED_FILES_JSON = "uploaded_files.json"
+uploaded_files = []
+def save_uploaded_files_to_json():
+    """Persist the in-memory ``uploaded_files`` list to ``UPLOADED_FILES_JSON`` as indented UTF-8 JSON."""
+    with open(UPLOADED_FILES_JSON, mode="w", encoding="utf-8") as handle:
+        # ensure_ascii=False keeps non-ASCII file names readable in the JSON file.
+        json.dump(uploaded_files, handle, indent=2, ensure_ascii=False)
+def load_uploaded_files_from_json():
+    """Populate the global ``uploaded_files`` list from ``UPLOADED_FILES_JSON``.
+
+    Falls back to an empty list when the file is missing, is not valid
+    UTF-8 JSON, or does not hold a JSON list. This function runs at import
+    time, so a damaged bookkeeping file must not prevent the app from starting.
+    """
+    global uploaded_files
+    try:
+        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
+            loaded = json.load(f)
+    except (FileNotFoundError, json.JSONDecodeError, UnicodeDecodeError):
+        loaded = []
+    # The rest of the module iterates and appends, so anything but a list is unusable.
+    uploaded_files = loaded if isinstance(loaded, list) else []
+def update_uploaded_files():
+    """Return Markdown listing the uploaded files, or a placeholder line when the list is empty."""
+    if uploaded_files:
+        # Each timestamp is cut to its first 19 characters before display.
+        entries = [
+            f"- {item['name']} (Uploaded: {item['timestamp'][:19]})" for item in uploaded_files
+        ]
+        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(entries)
+    return "_Chưa có tài liệu nào được tải lên._"
+# Load the saved upload list at startup
+load_uploaded_files_from_json()
+def process_document(file):
+    """Index an uploaded document into the on-disk FAISS store and record it in the upload list.
+
+    Args:
+        file: The value delivered by the ``gr.File`` input; its ``.name`` is
+            used as the path of the uploaded file. May be ``None`` when the
+            button is pressed without a file selected.
+
+    Returns:
+        A ``(status_message, uploaded_files_markdown)`` tuple for the status
+        textbox and the uploaded-files list.
+    """
+    if file is None:
+        return "Chưa chọn tài liệu để tải lên.", update_uploaded_files()
+    file_path = file.name
+    # Match extensions case-insensitively so "REPORT.PDF" is accepted too.
+    lowered = file_path.lower()
+    try:
+        if lowered.endswith(".pdf"):
+            loader = PyPDFLoader(file_path)
+        elif lowered.endswith(".csv"):
+            loader = CSVLoader(file_path)
+        elif lowered.endswith(".txt"):
+            loader = TextLoader(file_path, autodetect_encoding=True)   # <== fixes the .txt loading error
+        elif lowered.endswith((".docx", ".doc")):
+            loader = UnstructuredWordDocumentLoader(file_path)
+        else:
+            return "Định dạng file không hỗ trợ.", update_uploaded_files()
+        documents = loader.load()
+    except Exception as e:
+        # UI boundary: report any loader failure to the user instead of raising.
+        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+    docs = splitter.split_documents(documents)
+    if not docs:
+        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()
+    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    db = FAISS.from_documents(docs, embeddings)
+    # Remove the previous index only once the new one has been built, so an
+    # unsupported, unreadable or empty upload no longer wipes the working store.
+    if os.path.exists("vectorstore"):
+        shutil.rmtree("vectorstore")
+    db.save_local("vectorstore")
+    reload_retriever()
+    uploaded_files.append({"name": os.path.basename(file.name), "timestamp": datetime.now().isoformat()})
+    save_uploaded_files_to_json()
+    return f"Đã xử lý {len(docs)} đoạn từ **{file.name}**", update_uploaded_files()
+def delete_file(filename):
+    """Drop every entry named ``filename`` from the upload list, save it, and return the refreshed Markdown.
+
+    Only the bookkeeping list is modified here; nothing in this function touches the vector store.
+    """
+    global uploaded_files
+    target = filename.strip()
+    remaining = [entry for entry in uploaded_files if entry["name"] != target]
+    uploaded_files = remaining
+    save_uploaded_files_to_json()
+    return update_uploaded_files()
+def clear_inputs():
+    """Return two empty strings; the clear button uses them to reset the question box and the output."""
+    return "", ""
+def query_function(question, model_choice, temperature, include_sources):
+    """Answer ``question`` via ``answer_query`` and optionally append a list of sources.
+
+    The answer text is HTML-escaped before being returned. When
+    ``include_sources`` is truthy and documents came back, each document
+    contributes its ``section`` metadata, or the base name of its ``source``
+    metadata ("Unknown" if absent); labels are de-duplicated and sorted.
+    """
+    raw_answer, docs = answer_query(question, model=model_choice, temperature=temperature)
+    rendered = html.escape(raw_answer)
+    if not (include_sources and docs):
+        return rendered
+    labels = set()
+    for doc in docs:
+        section = doc.metadata.get("section")
+        label = section if section else os.path.basename(doc.metadata.get("source", "Unknown"))
+        labels.add(label.strip())
+    if not labels:
+        return rendered
+    bullets = "\n".join(f"- {label}" for label in sorted(labels))
+    return rendered + "\n\n**Nguồn tham khảo:**\n" + bullets
+# Gradio interface
+with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
+    with gr.Row():
+        with gr.Column(scale=5):
+            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")
+    with gr.Tabs():
+        with gr.TabItem("🔍 Tìm kiếm"):
+            with gr.Column(elem_classes="container-box"):
+                question = gr.Textbox(lines=3, label="Câu hỏi")
+                # The model dropdown was removed from the UI, but query_function still
+                # takes a model argument and the click handler below still lists
+                # model_choice as an input. Keep it as a hidden State holding the
+                # former default so building the UI no longer raises NameError.
+                model_choice = gr.State("Gemini Pro")
+                with gr.Row():
+                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
+                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
+                with gr.Row():
+                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
+                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
+                output = gr.Markdown(elem_classes="output-box")  # Show the result inside a styled box
+            search_btn.click(query_function, inputs=[question, model_choice, temperature, include_sources], outputs=[output])
+            clear_btn.click(clear_inputs, outputs=[question, output])
+        with gr.TabItem("📚 Quản lý tài liệu"):
+            with gr.Column(elem_classes="container-box"):
+                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
+                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
+                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
+            uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
+            with gr.Column(elem_classes="container-box"):
+                delete_filename = gr.Textbox(label="Tên file muốn xóa")
+                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
+            upload_btn.click(process_document, inputs=[upload_file], outputs=[upload_status, uploaded_files_list])
+            delete_btn.click(delete_file, inputs=[delete_filename], outputs=[uploaded_files_list])
+    demo.launch(share=True)