# ragflow-enterprise-search-app-Cybersoft
#
# Running
#
# File size: 8,102 Bytes
#
# 8cf4b8e

import gradio as gr
import os
from datetime import datetime
from retriever import retriever, reload_retriever
from generator import answer_query
from langchain_community.document_loaders import PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Handle a document upload and rebuild the FAISS index
def process_document(file):
    """Index an uploaded document into the local FAISS vector store.

    The document is read with a loader chosen from its file extension, split
    into 500-character chunks with a 50-character overlap, embedded with
    ``sentence-transformers/all-MiniLM-L6-v2`` and saved to the
    ``vectorstore`` directory. ``reload_retriever()`` is called afterwards,
    presumably so later searches use the new index (defined in ``retriever``).

    NOTE(review): the index is built from this file's chunks only, so saving
    it presumably replaces any previously stored index even though the status
    message says "added" -- confirm the intended behaviour.

    Args:
        file: Value delivered by the upload component. May be an object that
            exposes the temporary path as ``.name``, a plain path string, or
            ``None`` when no file was selected.

    Returns:
        A status message (in Vietnamese) to display in the UI.
    """
    # Clicking the button without choosing a file hands us None; report it
    # instead of failing on ``file.name``.
    if file is None:
        return "Vui lòng chọn file trước khi tải lên!"

    # Accept both file-like objects (with .name) and plain path strings.
    file_path = getattr(file, "name", file)
    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered_path = file_path.lower()

    # Pick the loader from the file extension
    if lowered_path.endswith(".pdf"):
        loader = PyPDFLoader(file_path)
    elif lowered_path.endswith(".csv"):
        loader = CSVLoader(file_path)
    elif lowered_path.endswith(".txt"):
        loader = TextLoader(file_path)
    elif lowered_path.endswith((".docx", ".doc")):
        loader = UnstructuredWordDocumentLoader(file_path)
    else:
        return "Định dạng file không hỗ trợ!"

    # Load the document
    documents = loader.load()

    # Split the text into chunks
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    docs = splitter.split_documents(documents)

    if not docs:
        return "Không trích xuất được nội dung từ file tải lên."

    # Build a fresh FAISS index from these chunks
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    db.save_local("vectorstore")
    reload_retriever()

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return f"Đã xử lý và thêm {len(docs)} đoạn tài liệu vào VectorStore lúc {timestamp}"

# Search handler
def query_function(question, model_choice, temperature, include_sources):
    """Answer a question and optionally append the retrieved sources.

    Delegates to ``answer_query`` and, when ``include_sources`` is truthy and
    sources were returned, appends a numbered list of each source's
    ``page_content`` followed by its ``source`` and ``page`` metadata (when
    the document carries non-empty metadata).

    Args:
        question: The user's question text.
        model_choice: Model name forwarded as ``model``.
        temperature: Sampling temperature forwarded as ``temperature``.
        include_sources: Whether to append the source listing.

    Returns:
        The answer text, with characters that cannot be encoded as UTF-8
        dropped.
    """
    answer, sources = answer_query(question, model=model_choice, temperature=temperature)

    result = answer
    if include_sources and sources:
        pieces = ["\n\n**Nguồn tài liệu:**\n"]
        for position, doc in enumerate(sources, start=1):
            pieces.append(f"{position}. {doc.page_content}\n")
            metadata = getattr(doc, 'metadata', None)
            if metadata:
                pieces.append(f"   - Nguồn: {metadata.get('source', 'Unknown')}\n")
                pieces.append(f"   - Trang: {metadata.get('page', 'N/A')}\n")
        result = answer + "".join(pieces)

    # Round-trip through UTF-8, discarding anything that cannot be encoded.
    return result.encode('utf-8', errors='ignore').decode('utf-8')

def clear_inputs():
    """Return the values that reset the search form.

    The result is wired to two text boxes (the question field and the result
    field), so both values must be strings; a list would be rendered as the
    literal text "[]" instead of clearing the box.

    Returns:
        A tuple of two empty strings.
    """
    return "", ""

# Gradio interface: a header followed by three tabs (search, document
# management, system settings).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page header (title and short description, rendered from Markdown)
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(
                """

                # 🔎 RAGFlow Enterprise Search

                ### Công cụ tìm kiếm thông minh dựa trên RAG (Retrieval-Augmented Generation)



                Hệ thống giúp truy xuất và trả lời câu hỏi từ tài liệu nội bộ doanh nghiệp.

                """
            )
    
    with gr.Tabs():
        # Search tab
        with gr.TabItem("Tìm kiếm 🔍"):
            with gr.Row():
                # Left column: the question input
                with gr.Column(scale=3):
                    question = gr.Textbox(
                        label="Nhập câu hỏi của bạn:",
                        placeholder="Ví dụ: Quy trình xin nghỉ phép nội bộ là gì?",
                        lines=2
                    )
                # Right column: generation options passed to query_function
                with gr.Column(scale=1):
                    model_choice = gr.Dropdown(
                        label="Mô hình AI",
                        choices=["Gemini Pro", "GPT-3.5", "GPT-4", "Claude"],
                        value="Gemini Pro"
                    )
                    temperature = gr.Slider(
                        label="Temperature",
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.1
                    )
                    include_sources = gr.Checkbox(
                        label="Hiển thị nguồn tài liệu",
                        value=True
                    )
            
            search_button = gr.Button("🔍 Tìm kiếm", variant="primary")
            clear_button = gr.Button("🗑️ Xóa")
            output = gr.Textbox(
                label="Kết quả tìm kiếm:",
                lines=15,
                interactive=False
            )
            
            # Both the button click and pressing Enter in the question box
            # run the same query handler with the same inputs.
            search_button.click(
                query_function,
                inputs=[question, model_choice, temperature, include_sources],
                outputs=output
            )
            question.submit(
                query_function,
                inputs=[question, model_choice, temperature, include_sources],
                outputs=output
            )
            # The clear button writes clear_inputs()'s two return values into
            # the question box and the result box.
            clear_button.click(clear_inputs, outputs=[question, output])

        # Document management tab
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Row():
                with gr.Column():
                    upload_file = gr.File(
                        label="Tải lên tài liệu mới (PDF, Word, CSV, TXT)",
                        file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"]
                    )
                    upload_button = gr.Button("📤 Tải lên và xử lý", variant="primary")
                
                with gr.Column():
                    upload_status = gr.Textbox(
                        label="📄 Trạng thái:",
                        lines=3,
                        interactive=False
                    )
            
            # Heading only: no list component is created under it here.
            gr.Markdown("### 📊 Danh sách tài liệu đã xử lý")
            # The uploaded file goes to process_document; its status message
            # is shown in the status box.
            upload_button.click(
                process_document,
                inputs=upload_file,
                outputs=upload_status
            )

        # Settings tab (optional)
        with gr.TabItem("⚙️ Cài đặt hệ thống"):
            gr.Markdown("### ⚙️ Cấu hình Vector Store & Embedding")
            with gr.Row():
                with gr.Column():
                    vector_store = gr.Dropdown(
                        label="Vector Store",
                        choices=["FAISS", "Pinecone", "Milvus"],
                        value="FAISS"
                    )
                    embedding_model = gr.Dropdown(
                        label="Embedding Model",
                        choices=["Sentence-Transformers", "OpenAI Embeddings", "Cohere Embeddings"],
                        value="Sentence-Transformers"
                    )
                with gr.Column():
                    chunk_size = gr.Slider(
                        label="Chunk size (độ dài văn bản mỗi đoạn)",
                        minimum=100,
                        maximum=1000,
                        value=500,
                        step=50
                    )
                    chunk_overlap = gr.Slider(
                        label="Chunk overlap (chồng lấp giữa các đoạn)",
                        minimum=0,
                        maximum=200,
                        value=50,
                        step=10
                    )
            
            save_settings = gr.Button("💾 Lưu cài đặt", variant="primary")
            settings_status = gr.Textbox(
                label="🗂️ Trạng thái:",
                interactive=False
            )
            def save_system_settings(vector_store, embedding_model, chunk_size, chunk_overlap):
                """Build the confirmation message for the settings form.

                This function only formats the chosen values into a status
                string; it does not persist or apply them.

                Args:
                    vector_store: Selected vector store name.
                    embedding_model: Selected embedding model name.
                    chunk_size: Selected chunk size.
                    chunk_overlap: Selected overlap between chunks.

                Returns:
                    The status message (in Vietnamese) echoing all four values.
                """
                # Report the overlap value itself (previously the chunk size
                # was printed twice).
                return f"✅ Đã lưu: VectorStore={vector_store}, Embedding={embedding_model}, ChunkSize={chunk_size}, Overlap={chunk_overlap}"

            # Wire the save button: the four settings values are passed to
            # save_system_settings and its message is shown in the status box.
            save_settings.click(
                save_system_settings,
                inputs=[vector_store, embedding_model, chunk_size, chunk_overlap],
                outputs=settings_status
            )


# Launch the app. This call sits at module level with no __main__ guard, so it
# runs whenever the module is executed or imported.
demo.launch()