vanhai123 committed on
Commit
580e6fc
·
verified ·
1 Parent(s): fe93241

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -147
app.py CHANGED
@@ -1,147 +1,146 @@
1
- import gradio as gr
2
- import os
3
- import json
4
- import shutil
5
- from datetime import datetime
6
- from retriever import retriever, reload_retriever
7
- from generator import answer_query
8
- from langchain_community.document_loaders import (
9
- PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
10
- )
11
- from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain_huggingface import HuggingFaceEmbeddings
13
- from langchain_community.vectorstores import FAISS
14
- import html
15
-
16
- # Đường dẫn file CSS
17
- CUSTOM_CSS_PATH = "gradio_theme.css"
18
-
19
- # Quản lý danh sách file upload
20
- UPLOADED_FILES_JSON = "uploaded_files.json"
21
- uploaded_files = []
22
-
23
- def save_uploaded_files_to_json():
24
- with open(UPLOADED_FILES_JSON, "w", encoding="utf-8") as f:
25
- json.dump(uploaded_files, f, ensure_ascii=False, indent=2)
26
-
27
- def load_uploaded_files_from_json():
28
- global uploaded_files
29
- if os.path.exists(UPLOADED_FILES_JSON):
30
- with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
31
- uploaded_files = json.load(f)
32
- else:
33
- uploaded_files = []
34
-
35
- def update_uploaded_files():
36
- if not uploaded_files:
37
- return "_Chưa có tài liệu nào được tải lên._"
38
- return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(
39
- f"- {f['name']} (Uploaded: {f['timestamp'][:19]})" for f in uploaded_files
40
- )
41
-
42
- # Load khi khởi động
43
- load_uploaded_files_from_json()
44
-
45
- def process_document(file):
46
- file_path = file.name
47
-
48
- if os.path.exists("vectorstore"):
49
- shutil.rmtree("vectorstore")
50
-
51
- try:
52
- if file_path.endswith(".pdf"):
53
- loader = PyPDFLoader(file_path)
54
- elif file_path.endswith(".csv"):
55
- loader = CSVLoader(file_path)
56
- elif file_path.endswith(".txt"):
57
- loader = TextLoader(file_path, autodetect_encoding=True) # <== fix lỗi txt
58
- elif file_path.endswith(".docx") or file_path.endswith(".doc"):
59
- loader = UnstructuredWordDocumentLoader(file_path)
60
- else:
61
- return "Định dạng file không hỗ trợ.", update_uploaded_files()
62
-
63
- documents = loader.load()
64
- except Exception as e:
65
- return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()
66
-
67
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
68
- docs = splitter.split_documents(documents)
69
-
70
- if not docs:
71
- return "⚠️ Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()
72
-
73
- embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
74
- db = FAISS.from_documents(docs, embeddings)
75
- db.save_local("vectorstore")
76
- reload_retriever()
77
-
78
- uploaded_files.append({"name": os.path.basename(file.name), "timestamp": datetime.now().isoformat()})
79
- save_uploaded_files_to_json()
80
-
81
- return f"Đã xử lý {len(docs)} đoạn từ **{file.name}**", update_uploaded_files()
82
-
83
- def delete_file(filename):
84
- global uploaded_files
85
- filename = filename.strip()
86
- uploaded_files = [f for f in uploaded_files if f["name"] != filename]
87
- save_uploaded_files_to_json()
88
- return update_uploaded_files()
89
-
90
- def clear_inputs():
91
- return "", ""
92
-
93
- def query_function(question, model_choice, temperature, include_sources):
94
- answer, docs = answer_query(question, model=model_choice, temperature=temperature)
95
- answer = html.escape(answer)
96
-
97
- if include_sources and docs:
98
- unique_sources = set()
99
- for doc in docs:
100
- section = doc.metadata.get("section")
101
- if section:
102
- unique_sources.add(section.strip())
103
- else:
104
- filename = os.path.basename(doc.metadata.get("source", "Unknown"))
105
- unique_sources.add(filename.strip())
106
- if unique_sources:
107
- sources_list = [f"- {src}" for src in sorted(unique_sources)]
108
- sources_text = "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
109
- answer += sources_text
110
- return answer
111
-
112
- # Giao diện Gradio
113
- with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
114
- with gr.Row():
115
- with gr.Column(scale=5):
116
- gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")
117
-
118
- with gr.Tabs():
119
- with gr.TabItem("🔍 Tìm kiếm"):
120
- with gr.Column(elem_classes="container-box"):
121
- question = gr.Textbox(lines=3, label="Câu hỏi")
122
- with gr.Row():
123
- model_choice = gr.Dropdown(["Gemini Pro", "GPT-3.5", "GPT-4", "Claude"], value="Gemini Pro", label="Mô hình")
124
- temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
125
- include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
126
- with gr.Row():
127
- search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
128
- clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
129
- output = gr.Markdown(elem_classes="output-box") # Hiển thị kết quả trong khung đẹp
130
-
131
- search_btn.click(query_function, inputs=[question, model_choice, temperature, include_sources], outputs=[output])
132
- clear_btn.click(clear_inputs, outputs=[question, output])
133
-
134
- with gr.TabItem("📚 Quản lý tài liệu"):
135
- with gr.Column(elem_classes="container-box"):
136
- upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
137
- upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
138
- upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
139
- uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
140
- with gr.Column(elem_classes="container-box"):
141
- delete_filename = gr.Textbox(label="Tên file muốn xóa")
142
- delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
143
-
144
- upload_btn.click(process_document, inputs=[upload_file], outputs=[upload_status, uploaded_files_list])
145
- delete_btn.click(delete_file, inputs=[delete_filename], outputs=[uploaded_files_list])
146
-
147
- demo.launch(share=True)
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import shutil
5
+ from datetime import datetime
6
+ from retriever import retriever, reload_retriever
7
+ from generator import answer_query
8
+ from langchain_community.document_loaders import (
9
+ PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
10
+ )
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain_huggingface import HuggingFaceEmbeddings
13
+ from langchain_community.vectorstores import FAISS
14
+ import html
15
+
16
# Path of the CSS file handed to gr.Blocks(css=...).
CUSTOM_CSS_PATH = "gradio_theme.css"

# Registry of uploaded files: persisted in this JSON file and mirrored in the
# module-level list below.
UPLOADED_FILES_JSON = "uploaded_files.json"
uploaded_files = []
22
+
23
def save_uploaded_files_to_json():
    """Persist the in-memory ``uploaded_files`` registry to ``UPLOADED_FILES_JSON``.

    The registry is serialised completely before anything touches the disk and
    is then swapped into place with ``os.replace``. A serialisation error or an
    interrupted write therefore never leaves a truncated registry behind.

    Raises:
        TypeError: if an entry in ``uploaded_files`` is not JSON-serialisable.
        OSError: if the temporary file cannot be written or moved into place.
    """
    # Serialise first: if this raises, the existing file is still intact.
    payload = json.dumps(uploaded_files, ensure_ascii=False, indent=2)

    # Write next to the target so os.replace stays on one filesystem.
    tmp_path = f"{UPLOADED_FILES_JSON}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.write(payload)
    os.replace(tmp_path, UPLOADED_FILES_JSON)
26
+
27
def load_uploaded_files_from_json():
    """Populate the global ``uploaded_files`` list from ``UPLOADED_FILES_JSON``.

    A missing file yields an empty registry. A file that cannot be decoded, or
    whose top-level value is not a list, is reported through ``logging`` and
    also yields an empty registry, so a damaged file cannot stop the app from
    starting.
    """
    global uploaded_files
    import logging  # local import: the module's import block is left untouched

    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been uploaded yet.
        data = []
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        logging.getLogger(__name__).warning(
            "Ignoring unreadable registry %s: %s", UPLOADED_FILES_JSON, exc
        )
        data = []

    if not isinstance(data, list):
        # Other functions append to and iterate this value, so it must be a list.
        logging.getLogger(__name__).warning(
            "Ignoring registry %s: expected a JSON list", UPLOADED_FILES_JSON
        )
        data = []

    uploaded_files = data
34
+
35
def update_uploaded_files():
    """Render the upload registry as Markdown.

    Returns:
        A heading followed by one bullet per entry in ``uploaded_files`` (name
        plus the first 19 characters of its timestamp), or an italic
        placeholder when the registry is empty.
    """
    if uploaded_files:
        bullets = [
            "- {} (Uploaded: {})".format(entry["name"], entry["timestamp"][:19])
            for entry in uploaded_files
        ]
        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"
41
+
42
# Load the persisted registry once at import time.
load_uploaded_files_from_json()
44
+
45
def process_document(file):
    """Index an uploaded document and record it in the upload registry.

    The file is loaded with a loader chosen from its extension, split into
    chunks, embedded, and stored as a FAISS index in the ``vectorstore``
    directory. The previous index is removed only after the new one has been
    built, so a rejected or unreadable upload leaves the existing index alone.

    Args:
        file: The value delivered by the ``gr.File`` input. Either an object
            whose ``name`` attribute is the path on disk or a plain path
            string is accepted; ``None`` means nothing was selected.

    Returns:
        A ``(status_message, registry_markdown)`` tuple for the two outputs
        wired to the upload button.
    """
    if file is None:
        # The button was clicked without selecting a file.
        return "Vui lòng chọn một tài liệu để tải lên.", update_uploaded_files()

    file_path = getattr(file, "name", file)
    lowered = file_path.lower()  # match extensions case-insensitively (.PDF, .Txt)

    try:
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered.endswith(".txt"):
            # Encoding auto-detection avoids decode errors on non-UTF-8 text files.
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()

        documents = loader.load()
    except Exception as e:
        # UI boundary: report any loader failure to the user instead of crashing.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)

    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Drop the old index only now that a replacement exists.
    # NOTE(review): the index is rebuilt from this one document, so on disk it
    # only ever holds the latest upload while the registry keeps every name.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    display_name = os.path.basename(file_path)
    uploaded_files.append({"name": display_name, "timestamp": datetime.now().isoformat()})
    save_uploaded_files_to_json()

    # Show the same base name the registry stores rather than the full temp path.
    return f"Đã xử lý {len(docs)} đoạn từ **{display_name}**", update_uploaded_files()
82
+
83
def delete_file(filename):
    """Remove every registry entry whose name equals ``filename``.

    Surrounding whitespace in ``filename`` is ignored. Only the registry is
    changed and saved; this function does not touch the vector store.

    Args:
        filename: Name typed by the user into the delete textbox.

    Returns:
        The refreshed registry Markdown.
    """
    global uploaded_files
    target = filename.strip()
    remaining = []
    for entry in uploaded_files:
        if entry["name"] != target:
            remaining.append(entry)
    uploaded_files = remaining
    save_uploaded_files_to_json()
    return update_uploaded_files()
89
+
90
def clear_inputs():
    """Return one empty string for each of the two components being cleared."""
    blank = ""
    return blank, blank
92
+
93
def query_function(question, model_choice, temperature, include_sources):
    """Answer a question and optionally append the list of sources used.

    The text is rendered by a Markdown component, so both the model answer and
    the source labels (which come from document metadata) are HTML-escaped.

    Args:
        question: The user's question; blank input returns a prompt to type
            one without calling the model.
        model_choice: Forwarded to ``answer_query`` as ``model``.
        temperature: Forwarded to ``answer_query`` as ``temperature``.
        include_sources: When truthy, append a "Nguồn tham khảo" bullet list.

    Returns:
        The Markdown string to display.
    """
    if not question or not question.strip():
        # Nothing to ask: skip the model call.
        return "Vui lòng nhập câu hỏi."

    answer, docs = answer_query(question, model=model_choice, temperature=temperature)
    answer = html.escape(answer)

    if not (include_sources and docs):
        return answer

    labels = set()
    for doc in docs:
        # Prefer the section title; fall back to the source file's base name.
        label = doc.metadata.get("section")
        if not label:
            label = os.path.basename(doc.metadata.get("source", "Unknown"))
        label = str(label).strip()
        if label:  # whitespace-only labels would render as empty bullets
            labels.add(label)

    if labels:
        bullets = [f"- {html.escape(label)}" for label in sorted(labels)]
        answer += "\n\n**Nguồn tham khảo:**\n" + "\n".join(bullets)
    return answer
111
+
112
# Gradio interface.
# NOTE(review): recent Gradio releases take stylesheet files through a separate
# `css_paths` argument; confirm `css=` accepts a path on the pinned version.
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")

    with gr.Tabs():
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                # The model selector is no longer shown, but query_function
                # still takes a model argument. A State component supplies a
                # fixed value so the click handler below has a defined
                # `model_choice` input instead of raising NameError at import.
                model_choice = gr.State("Gemini Pro")
                with gr.Row():
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
                output = gr.Markdown(elem_classes="output-box")  # result shown inside the styled box

            search_btn.click(query_function, inputs=[question, model_choice, temperature, include_sources], outputs=[output])
            clear_btn.click(clear_inputs, outputs=[question, output])

        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
                upload_btn = gr.Button("📄 Tải lên xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

            upload_btn.click(process_document, inputs=[upload_file], outputs=[upload_status, uploaded_files_list])
            delete_btn.click(delete_file, inputs=[delete_filename], outputs=[uploaded_files_list])

demo.launch(share=True)