Update app.py
app.py CHANGED
@@ -6,29 +6,55 @@ import time
 import re
 from datetime import datetime
 import json
+import tempfile
+import os
+import uuid
+import shutil
+
+# Global model cache
+_model_cache = {}

-# Model loading and caching
-@gr.cache_model
 def load_translation_models():
     """Load and cache both translation models"""
+    global _model_cache
+
+    # Check if models are already cached
+    if 'en_ss_pipeline' in _model_cache and 'ss_en_pipeline' in _model_cache:
+        return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']
+
     try:
+        print("Loading translation models...")
+
         # English to Siswati
+        print("Loading English to Siswati model...")
         en_ss_tokenizer = AutoTokenizer.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_pipeline = pipeline("translation", model=en_ss_model, tokenizer=en_ss_tokenizer)

         # Siswati to English
+        print("Loading Siswati to English model...")
         ss_en_tokenizer = AutoTokenizer.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_pipeline = pipeline("translation", model=ss_en_model, tokenizer=ss_en_tokenizer)

+        # Cache the models
+        _model_cache['en_ss_pipeline'] = en_ss_pipeline
+        _model_cache['ss_en_pipeline'] = ss_en_pipeline
+
+        print("Models loaded successfully!")
         return en_ss_pipeline, ss_en_pipeline
+
     except Exception as e:
         print(f"Error loading models: {e}")
         return None, None

-
-
+def get_translators():
+    """Get cached translators, loading them if necessary"""
+    global _model_cache
+
+    if 'en_ss_pipeline' not in _model_cache or 'ss_en_pipeline' not in _model_cache:
+        return load_translation_models()
+
+    return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']

 def analyze_siswati_features(text):
     """Analyze Siswati-specific linguistic features"""
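The module-level `_model_cache` dict replaces the removed `@gr.cache_model` decorator, which does not appear in Gradio's public API, so the models are now loaded once per process and reused. The cache is not synchronized, though: two concurrent cold requests can both miss it and load the checkpoints twice. A lock would serialize the first load; a minimal sketch (the lock and wrapper are hypothetical additions, not part of this commit):

```python
import threading

# Hypothetical hardening: serialize the first load so two concurrent
# cold requests don't both download and instantiate the models.
_model_lock = threading.Lock()

def get_translators_locked():
    """Thread-safe wrapper around get_translators()."""
    with _model_lock:
        return get_translators()
```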
@@ -83,6 +109,9 @@ def translate_text(text, direction):
     start_time = time.time()

     try:
+        # Get translators (will load if not cached)
+        en_ss_translator, ss_en_translator = get_translators()
+
         # Perform translation
         if direction == "English → Siswati":
             if en_ss_translator is None:
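Because the pipelines are now fetched lazily inside the request handler, the first translation after a cold start pays the full model-load cost. If that matters, the cache can be warmed before serving; a sketch, assuming `app.py` ends with the usual Gradio launch block (not shown in this diff):

```python
# Hypothetical warm-up at startup: populate _model_cache before serving
# requests, so the first user doesn't wait for the checkpoints to load.
if __name__ == "__main__":
    get_translators()  # may take a while on the first run
    demo = create_gradio_interface()
    demo.launch()
```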
@@ -132,7 +161,6 @@ def create_analysis_report(source_metrics, target_metrics, siswati_features, pro
     ### Translation Details
     - **Direction**: {direction}
     - **Processing Time**: {processing_time:.2f} seconds
-    - **Timestamp**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

     ### Text Complexity Metrics
     | Metric | Source | Target | Ratio |
@@ -178,42 +206,117 @@ def create_metrics_table(source_metrics, target_metrics, processing_time):
     return pd.DataFrame(data)

-def …
-    """…
+def secure_file_processing(file_obj, direction):
+    """Securely process uploaded files with proper cleanup"""
     if file_obj is None:
         return "Please upload a file.", ""

+    # Create a unique temporary directory for this processing session
+    session_id = str(uuid.uuid4())
+    temp_dir = None
+
     try:
-        # …
-        texts = …
+        # Create secure temporary directory
+        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
+
+        # Get file extension and validate
+        file_ext = os.path.splitext(file_obj.name)[1].lower()
+        if file_ext not in ['.txt', '.csv']:
+            return "Only .txt and .csv files are supported.", ""
+
+        # Create secure temporary file path
+        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
+
+        # Copy uploaded file to secure location
+        shutil.copy2(file_obj.name, temp_file_path)

+        # Process file based on type
+        texts = []
+        if file_ext == '.csv':
+            try:
+                df = pd.read_csv(temp_file_path)
+                if df.empty:
+                    return "The uploaded CSV file is empty.", ""
+                # Assume first column contains text to translate
+                texts = df.iloc[:, 0].dropna().astype(str).tolist()
+            except Exception as e:
+                return f"Error reading CSV file: {str(e)}", ""
+        else:  # .txt file
+            try:
+                with open(temp_file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                texts = [line.strip() for line in content.split('\n') if line.strip()]
+            except Exception as e:
+                return f"Error reading text file: {str(e)}", ""
+
+        if not texts:
+            return "No text found in the uploaded file.", ""
+
+        # Limit batch size for performance and security
+        max_batch_size = 10
+        if len(texts) > max_batch_size:
+            texts = texts[:max_batch_size]
+            warning_msg = f"Processing limited to first {max_batch_size} entries for security and performance reasons."
+        else:
+            warning_msg = ""
+
+        # Get translators once for the whole batch (cached after the first call)
+        en_ss_translator, ss_en_translator = get_translators()
+
+        # Process translations
         results = []
         for i, text in enumerate(texts):
+            if len(text.strip()) == 0:
+                continue
+
+            # Limit individual text length for security
+            if len(text) > 1000:
+                text = text[:1000] + "..."
+
+            # Perform translation based on direction
+            try:
+                if direction == "English → Siswati":
+                    if en_ss_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = en_ss_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+                else:  # Siswati → English
+                    if ss_en_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = ss_en_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+            except Exception as e:
+                translated = f"Translation error: {str(e)}"
+
             results.append({
+                'Index': i + 1,
                 'Original': text[:100] + '...' if len(text) > 100 else text,
-                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
-                'Index': i + 1
+                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
             })

+        if not results:
+            return "No valid text entries found to translate.", ""
+
         results_df = pd.DataFrame(results)
-        summary = f"…
+        summary = f"Successfully processed {len(results)} text entries."
+        if warning_msg:
+            summary = f"{summary} {warning_msg}"

         return summary, results_df

     except Exception as e:
         return f"Error processing file: {str(e)}", ""
+
+    finally:
+        # Clean up temporary files and directory
+        if temp_dir and os.path.exists(temp_dir):
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as e:
+                print(f"Warning: Could not clean up temporary directory: {e}")

 # Define example texts
 TRANSLATION_EXAMPLES = [
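The `mkdtemp`/`finally` pairing above works, which is also why `shutil` belongs with the module-level imports rather than inside `try`: the cleanup path must be able to reach it. An alternative that gets the same isolation with guaranteed cleanup is `tempfile.TemporaryDirectory`, which removes the whole tree when the `with` block exits, even on early returns or exceptions. A minimal sketch of the staging-and-read step under that API (the function name is illustrative, not from the commit):

```python
import os
import shutil
import tempfile
import uuid

def read_upload_isolated(upload_path):
    """Stage an uploaded .txt file in an isolated temp dir and read its lines.

    TemporaryDirectory deletes the whole tree when the with-block exits,
    even if an exception is raised, so no explicit finally/rmtree is needed.
    """
    session_id = str(uuid.uuid4())
    with tempfile.TemporaryDirectory(prefix=f"translation_{session_id}_") as temp_dir:
        staged = os.path.join(temp_dir, f"upload_{session_id}.txt")
        shutil.copy2(upload_path, staged)
        with open(staged, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
```

The rest of the function (validation, batching, translation) would sit inside the `with` block unchanged.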
@@ -228,7 +331,7 @@ TRANSLATION_EXAMPLES = [
 ]

 def create_gradio_interface():
-    """Create the main Gradio interface"""
+    """Create the main Gradio interface with security measures"""

     with gr.Blocks(
         title="Siswati-English Linguistic Translation Tool",
@@ -274,7 +377,8 @@ def create_gradio_interface():
                 input_text = gr.Textbox(
                     label="Input Text",
                     placeholder="Enter text to translate...",
-                    lines=4
+                    lines=4,
+                    max_lines=10
                 )

                 translate_btn = gr.Button("Translate & Analyze", variant="primary", size="lg")
@@ -323,8 +427,14 @@ def create_gradio_interface():
         # Batch Processing Tab
         with gr.Tab("Batch Processing"):
             gr.Markdown("""
-            ### Corpus Analysis & Batch Translation
-            Upload text files or CSV files for batch translation and corpus analysis.
+            ### Secure Corpus Analysis & Batch Translation
+            Upload text files or CSV files for batch translation and corpus analysis. Files are processed securely and temporarily.
+
+            **Security Features:**
+            - Files are processed in isolated temporary directories
+            - No file persistence or history
+            - Automatic cleanup after processing
+            - Limited to first 10 entries for performance
             """)

             with gr.Row():
@@ -336,9 +446,10 @@ def create_gradio_interface():
                     )

                     file_upload = gr.File(
-                        label="Upload File",
+                        label="Upload File (Max 5MB)",
                         file_types=[".txt", ".csv"],
-                        type="filepath"
+                        type="filepath",
+                        file_count="single"
                     )

                     batch_btn = gr.Button("Process Batch", variant="primary")
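The new label promises a 5 MB cap, but `gr.File` does not enforce one here and `secure_file_processing` never checks the size (depending on the Gradio version, a global `max_file_size` can be passed to `launch()`, but nothing in this diff sets it). A small guard in the handler keeps the label honest; the constant and helper below are illustrative:

```python
import os

MAX_UPLOAD_BYTES = 5 * 1024 * 1024  # mirrors the "Max 5MB" label

def upload_too_large(path):
    """Return True if the uploaded file exceeds the advertised limit."""
    return os.path.getsize(path) > MAX_UPLOAD_BYTES
```

Calling it right after the extension check would reject oversized uploads before any copying or parsing.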
@@ -347,7 +458,8 @@ def create_gradio_interface():
                     **Supported formats:**
                     - `.txt` files: One text per line
                     - `.csv` files: Text in first column
-                    - **…
+                    - **Security limits**: Max 10 entries, 1000 chars per text
+                    - **Privacy**: Files are automatically deleted after processing
                     """)

                 with gr.Column():
@@ -364,7 +476,7 @@ def create_gradio_interface():
                     )

                 batch_btn.click(
-                    fn=…
+                    fn=secure_file_processing,
                     inputs=[file_upload, batch_direction],
                     outputs=[batch_summary, batch_results]
                 )
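One wiring detail worth flagging: every error path in `secure_file_processing` returns `""` for its second output, which is bound to `batch_results` (presumably a `gr.Dataframe`, given the success path returns a pandas DataFrame). Some Gradio versions render a bare string there poorly; normalizing error returns to an empty DataFrame is a safer sentinel. A hedged wrapper sketch (names are illustrative):

```python
import pandas as pd

EMPTY_RESULTS = pd.DataFrame(columns=['Index', 'Original', 'Translation'])

def batch_handler(file_obj, direction):
    """Normalize error returns so the Dataframe output always gets a DataFrame."""
    summary, results = secure_file_processing(file_obj, direction)
    if not isinstance(results, pd.DataFrame):
        results = EMPTY_RESULTS
    return summary, results
```

`batch_btn.click` would then point `fn` at the wrapper instead.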
@@ -373,7 +485,7 @@ def create_gradio_interface():
         with gr.Tab("Research Tools"):
             gr.Markdown("""
             ### Advanced Linguistic Analysis Tools
-            Explore detailed linguistic features
+            Explore detailed linguistic features without data persistence.
             """)

             with gr.Row():
@@ -381,7 +493,8 @@ def create_gradio_interface():
                     research_text = gr.Textbox(
                         label="Text for Analysis",
                         lines=6,
-                        placeholder="Enter Siswati or English text for detailed analysis..."
+                        placeholder="Enter Siswati or English text for detailed analysis...",
+                        max_lines=15
                     )

                     analyze_btn = gr.Button("Analyze Text", variant="primary")
@@ -392,18 +505,23 @@ def create_gradio_interface():
                     )

             def detailed_analysis(text):
-                """Perform detailed linguistic analysis"""
+                """Perform detailed linguistic analysis without storing data"""
                 if not text.strip():
                     return {}

+                # Limit text length for security
+                if len(text) > 2000:
+                    text = text[:2000] + "..."
+
                 metrics = calculate_linguistic_metrics(text)
                 siswati_features = analyze_siswati_features(text)

+                # Return analysis without sensitive information
                 return {
                     "basic_metrics": metrics,
                     "siswati_features": siswati_features,
-                    "…
-                    "…
+                    "text_length": len(text),
+                    "analysis_completed": True
                 }

             analyze_btn.click(
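A small subtlety in `detailed_analysis`: the 2000-character cap is applied before the metrics are computed, so `"text_length"` reports the truncated length (plus the appended ellipsis), not the submitted one. If the submitted length is the more useful figure, capture it before truncating; a minimal variant:

```python
def detailed_analysis(text):
    """Variant sketch: record the submitted length before truncating."""
    if not text.strip():
        return {}

    original_length = len(text)  # captured before the security cap
    if len(text) > 2000:
        text = text[:2000] + "..."

    return {
        "basic_metrics": calculate_linguistic_metrics(text),
        "siswati_features": analyze_siswati_features(text),
        "text_length": original_length,  # submitted length, not truncated
        "analysis_completed": True,
    }
```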
@@ -440,6 +558,12 @@ def create_gradio_interface():

     **Training Data**: Models trained on the Vuk'uzenzele and ZA-gov-multilingual South African corpora.

+    ### Privacy & Security
+    - No conversation history is stored
+    - Uploaded files are automatically deleted after processing
+    - All processing happens in isolated temporary environments
+    - No user data persistence
+
     ### Acknowledgments
     We thank **Thapelo Sindanie** and **Unarine Netshifhefhe** for their contributions to this work.