vukosi committed
Commit be17e77 · verified · 1 Parent(s): 1be67ab

Update app.py

Files changed (1):
  1. app.py +160 -36

app.py CHANGED
@@ -6,29 +6,56 @@ import time
 import re
 from datetime import datetime
 import json
+import tempfile
+import os
+import uuid
+import shutil
+
+# Global model cache
+_model_cache = {}
 
-# Model loading and caching
-@gr.cache_model
 def load_translation_models():
     """Load and cache both translation models"""
+    global _model_cache
+
+    # Check if models are already cached
+    if 'en_ss_pipeline' in _model_cache and 'ss_en_pipeline' in _model_cache:
+        return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']
+
     try:
+        print("Loading translation models...")
+
         # English to Siswati
+        print("Loading English to Siswati model...")
         en_ss_tokenizer = AutoTokenizer.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_pipeline = pipeline("translation", model=en_ss_model, tokenizer=en_ss_tokenizer)
 
         # Siswati to English
+        print("Loading Siswati to English model...")
         ss_en_tokenizer = AutoTokenizer.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_pipeline = pipeline("translation", model=ss_en_model, tokenizer=ss_en_tokenizer)
 
+        # Cache the models
+        _model_cache['en_ss_pipeline'] = en_ss_pipeline
+        _model_cache['ss_en_pipeline'] = ss_en_pipeline
+
+        print("Models loaded successfully!")
         return en_ss_pipeline, ss_en_pipeline
+
     except Exception as e:
         print(f"Error loading models: {e}")
         return None, None
 
-# Load models at startup
-en_ss_translator, ss_en_translator = load_translation_models()
+def get_translators():
+    """Get cached translators, loading them if necessary"""
+    global _model_cache
+
+    if 'en_ss_pipeline' not in _model_cache or 'ss_en_pipeline' not in _model_cache:
+        return load_translation_models()
+
+    return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']
 
 def analyze_siswati_features(text):
     """Analyze Siswati-specific linguistic features"""
@@ -83,6 +110,9 @@ def translate_text(text, direction):
     start_time = time.time()
 
     try:
+        # Get translators (will load if not cached)
+        en_ss_translator, ss_en_translator = get_translators()
+
         # Perform translation
         if direction == "English → Siswati":
             if en_ss_translator is None:
@@ -132,7 +162,6 @@ def create_analysis_report(source_metrics, target_metrics, siswati_features, pro
     ### Translation Details
     - **Direction**: {direction}
     - **Processing Time**: {processing_time:.2f} seconds
-    - **Timestamp**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
 
     ### Text Complexity Metrics
     | Metric | Source | Target | Ratio |
@@ -178,42 +207,116 @@ def create_metrics_table(source_metrics, target_metrics, processing_time):
 
     return pd.DataFrame(data)
 
-def batch_translate(file_obj, direction):
-    """Process batch translations from uploaded file"""
+def secure_file_processing(file_obj, direction):
+    """Securely process uploaded files with proper cleanup"""
     if file_obj is None:
         return "Please upload a file.", ""
 
+    # Create a unique temporary directory for this processing session
+    session_id = str(uuid.uuid4())
+    temp_dir = None
+
     try:
-        # Read file content
-        if file_obj.name.endswith('.csv'):
-            df = pd.read_csv(file_obj.name)
-            # Assume first column contains text to translate
-            texts = df.iloc[:, 0].dropna().astype(str).tolist()
-        else:
-            # Plain text file
-            with open(file_obj.name, 'r', encoding='utf-8') as f:
-                content = f.read()
-            texts = [line.strip() for line in content.split('\n') if line.strip()]
-
-        # Limit batch size for demo
-        texts = texts[:10]  # Process first 10 entries
+        # Create secure temporary directory
+        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
+
+        # Get file extension and validate
+        file_ext = os.path.splitext(file_obj.name)[1].lower()
+        if file_ext not in ['.txt', '.csv']:
+            return "Only .txt and .csv files are supported.", ""
+
+        # Create secure temporary file path
+        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
+
+        # Copy uploaded file to secure location
+        shutil.copy2(file_obj.name, temp_file_path)
 
+        # Process file based on type
+        texts = []
+        if file_ext == '.csv':
+            try:
+                df = pd.read_csv(temp_file_path)
+                if df.empty:
+                    return "The uploaded CSV file is empty.", ""
+                # Assume first column contains text to translate
+                texts = df.iloc[:, 0].dropna().astype(str).tolist()
+            except Exception as e:
+                return f"Error reading CSV file: {str(e)}", ""
+        else:  # .txt file
+            try:
+                with open(temp_file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                texts = [line.strip() for line in content.split('\n') if line.strip()]
+            except Exception as e:
+                return f"Error reading text file: {str(e)}", ""
 
+        if not texts:
+            return "No text found in the uploaded file.", ""
+
+        # Limit batch size for performance and security
+        max_batch_size = 10
+        if len(texts) > max_batch_size:
+            texts = texts[:max_batch_size]
+            warning_msg = f"Processing limited to first {max_batch_size} entries for security and performance reasons."
+        else:
+            warning_msg = ""
+
+        # Process translations
         results = []
         for i, text in enumerate(texts):
-            translated, _, _ = translate_text(text, direction)
+            if len(text.strip()) == 0:
+                continue
+
+            # Limit individual text length for security
+            if len(text) > 1000:
+                text = text[:1000] + "..."
+
+            # Get translators for batch processing
+            en_ss_translator, ss_en_translator = get_translators()
+
+            # Perform translation based on direction
+            try:
+                if direction == "English → Siswati":
+                    if en_ss_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = en_ss_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+                else:  # Siswati → English
+                    if ss_en_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = ss_en_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+            except Exception as e:
+                translated = f"Translation error: {str(e)}"
+
             results.append({
+                'Index': i + 1,
                 'Original': text[:100] + '...' if len(text) > 100 else text,
-                'Translation': translated[:100] + '...' if len(translated) > 100 else translated,
-                'Index': i + 1
+                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
             })
 
+        if not results:
+            return "No valid text entries found to translate.", ""
+
         results_df = pd.DataFrame(results)
-        summary = f"Processed {len(results)} texts successfully."
+        summary = f"Successfully processed {len(results)} text entries."
+        if warning_msg:
+            summary = f"{summary} {warning_msg}"
 
         return summary, results_df
 
     except Exception as e:
         return f"Error processing file: {str(e)}", ""
+
+    finally:
+        # Clean up temporary files and directory
+        if temp_dir and os.path.exists(temp_dir):
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as e:
+                print(f"Warning: Could not clean up temporary directory: {e}")
 
 # Define example texts
 TRANSLATION_EXAMPLES = [
@@ -228,7 +331,7 @@ TRANSLATION_EXAMPLES = [
 ]
 
 def create_gradio_interface():
-    """Create the main Gradio interface"""
+    """Create the main Gradio interface with security measures"""
 
     with gr.Blocks(
         title="🔬 Siswati-English Linguistic Translation Tool",
@@ -274,7 +377,8 @@ def create_gradio_interface():
                 input_text = gr.Textbox(
                     label="Input Text",
                     placeholder="Enter text to translate...",
-                    lines=4
+                    lines=4,
+                    max_lines=10
                 )
 
                 translate_btn = gr.Button("🔄 Translate & Analyze", variant="primary", size="lg")
@@ -323,8 +427,14 @@
         # Batch Processing Tab
         with gr.Tab("📁 Batch Processing"):
             gr.Markdown("""
-            ### Corpus Analysis & Batch Translation
-            Upload text files or CSV files for batch translation and corpus analysis. Perfect for linguistic research and documentation projects.
+            ### Secure Corpus Analysis & Batch Translation
+            Upload text files or CSV files for batch translation and corpus analysis. Files are processed securely and temporarily.
+
+            **Security Features:**
+            - Files are processed in isolated temporary directories
+            - No file persistence or history
+            - Automatic cleanup after processing
+            - Limited to first 10 entries for performance
             """)
 
             with gr.Row():
@@ -336,9 +446,10 @@
                     )
 
                     file_upload = gr.File(
-                        label="Upload File",
+                        label="Upload File (Max 5MB)",
                         file_types=[".txt", ".csv"],
-                        type="filepath"
+                        type="filepath",
+                        file_count="single"
                     )
 
                     batch_btn = gr.Button("🔄 Process Batch", variant="primary")
@@ -347,7 +458,8 @@
                     **Supported formats:**
                     - `.txt` files: One text per line
                     - `.csv` files: Text in first column
-                    - **Limit**: First 10 entries for demo
+                    - **Security limits**: Max 10 entries, 1000 chars per text
+                    - **Privacy**: Files are automatically deleted after processing
                     """)
 
                 with gr.Column():
@@ -364,7 +476,7 @@
             )
 
             batch_btn.click(
-                fn=batch_translate,
+                fn=secure_file_processing,
                 inputs=[file_upload, batch_direction],
                 outputs=[batch_summary, batch_results]
             )
@@ -373,7 +485,7 @@
         with gr.Tab("🔬 Research Tools"):
             gr.Markdown("""
             ### Advanced Linguistic Analysis Tools
-            Explore detailed linguistic features and export research data.
+            Explore detailed linguistic features without data persistence.
             """)
 
             with gr.Row():
@@ -381,7 +493,8 @@
                 research_text = gr.Textbox(
                     label="Text for Analysis",
                     lines=6,
-                    placeholder="Enter Siswati or English text for detailed analysis..."
+                    placeholder="Enter Siswati or English text for detailed analysis...",
+                    max_lines=15
                 )
 
                 analyze_btn = gr.Button("🔍 Analyze Text", variant="primary")
@@ -392,18 +505,23 @@
             )
 
             def detailed_analysis(text):
-                """Perform detailed linguistic analysis"""
+                """Perform detailed linguistic analysis without storing data"""
                 if not text.strip():
                     return {}
 
+                # Limit text length for security
+                if len(text) > 2000:
+                    text = text[:2000] + "..."
+
                 metrics = calculate_linguistic_metrics(text)
                 siswati_features = analyze_siswati_features(text)
 
+                # Return analysis without sensitive information
                 return {
                     "basic_metrics": metrics,
                     "siswati_features": siswati_features,
-                    "text_preview": text[:100] + "..." if len(text) > 100 else text,
-                    "analysis_timestamp": datetime.now().isoformat()
+                    "text_length": len(text),
+                    "analysis_completed": True
                 }
 
             analyze_btn.click(
@@ -440,6 +558,12 @@
 
         **Training Data**: Models trained on the Vuk'uzenzele and ZA-gov-multilingual South African corpora.
 
+        ### 🔒 Privacy & Security
+        - No conversation history is stored
+        - Uploaded files are automatically deleted after processing
+        - All processing happens in isolated temporary environments
+        - No user data persistence
+
         ### 🙏 Acknowledgments
         We thank **Thapelo Sindanie** and **Unarine Netshifhefhe** for their contributions to this work.
 
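A quick smoke test for the new module-level cache (a sketch, not part of the commit: it assumes this file is saved as `app.py`, that its `transformers` dependencies are installed, and that the Space's launch call is guarded by `if __name__ == "__main__":` so importing has no side effects):

```python
# Hypothetical smoke test: the second get_translators() call should return
# the same pipeline objects from _model_cache instead of reloading them.
import time

from app import get_translators

t0 = time.time()
en_ss, ss_en = get_translators()      # cold call: downloads/loads both models
print(f"cold load: {time.time() - t0:.1f}s")

t0 = time.time()
en_ss_again, _ = get_translators()    # warm call: served from _model_cache
print(f"warm load: {time.time() - t0:.4f}s")

assert en_ss is en_ss_again           # same cached object, not a fresh load
print(en_ss("Good morning", max_length=512)[0]['translation_text'])
```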
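The batch path can be driven the same way Gradio drives it: `secure_file_processing` only reads the upload's `.name` attribute, so any object with that attribute works as a stand-in (the `Upload` namedtuple below is illustrative, not part of the app):

```python
# Hypothetical driver for secure_file_processing outside the UI.
from collections import namedtuple

from app import secure_file_processing

Upload = namedtuple("Upload", ["name"])  # stand-in for Gradio's file wrapper

# One text per line, matching the documented .txt format.
with open("sample.txt", "w", encoding="utf-8") as f:
    f.write("Good morning\nHow are you?\n")

summary, results_df = secure_file_processing(Upload("sample.txt"),
                                             "English → Siswati")
print(summary)      # e.g. "Successfully processed 2 text entries."
print(results_df)   # DataFrame with Index / Original / Translation columns
```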
 
569