Update app.py
app.py CHANGED
@@ -6,29 +6,55 @@ import time
 import re
 from datetime import datetime
 import json
+import tempfile
+import os
+import uuid
+import shutil
+
+# Global model cache
+_model_cache = {}

-# Model loading and caching
-@gr.cache_model
 def load_translation_models():
     """Load and cache both translation models"""
+    global _model_cache
+
+    # Check if models are already cached
+    if 'en_ss_pipeline' in _model_cache and 'ss_en_pipeline' in _model_cache:
+        return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']
+
     try:
+        print("Loading translation models...")
+
         # English to Siswati
+        print("Loading English to Siswati model...")
         en_ss_tokenizer = AutoTokenizer.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/en-ss-m2m100-combo")
         en_ss_pipeline = pipeline("translation", model=en_ss_model, tokenizer=en_ss_tokenizer)

         # Siswati to English
+        print("Loading Siswati to English model...")
         ss_en_tokenizer = AutoTokenizer.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_model = AutoModelForSeq2SeqLM.from_pretrained("dsfsi/ss-en-m2m100-combo")
         ss_en_pipeline = pipeline("translation", model=ss_en_model, tokenizer=ss_en_tokenizer)

+        # Cache the models
+        _model_cache['en_ss_pipeline'] = en_ss_pipeline
+        _model_cache['ss_en_pipeline'] = ss_en_pipeline
+
+        print("Models loaded successfully!")
         return en_ss_pipeline, ss_en_pipeline
+
     except Exception as e:
         print(f"Error loading models: {e}")
         return None, None

-
-
+def get_translators():
+    """Get cached translators, loading them if necessary"""
+    global _model_cache
+
+    if 'en_ss_pipeline' not in _model_cache or 'ss_en_pipeline' not in _model_cache:
+        return load_translation_models()
+
+    return _model_cache['en_ss_pipeline'], _model_cache['ss_en_pipeline']

 def analyze_siswati_features(text):
     """Analyze Siswati-specific linguistic features"""
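The module-level `_model_cache` dict replaces the removed `@gr.cache_model` decorator, which does not appear in Gradio's public API, so the models are now loaded once per process and reused. The cache is not synchronized, though: two concurrent cold requests can both miss it and load the checkpoints twice. A lock would serialize the first load; a minimal sketch (the lock and wrapper are hypothetical additions, not part of this commit):

```python
import threading

# Hypothetical hardening: serialize the first load so two concurrent
# cold requests don't both download and instantiate the models.
_model_lock = threading.Lock()

def get_translators_locked():
    """Thread-safe wrapper around get_translators()."""
    with _model_lock:
        return get_translators()
```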
@@ -83,6 +109,9 @@ def translate_text(text, direction):
     start_time = time.time()

     try:
+        # Get translators (will load if not cached)
+        en_ss_translator, ss_en_translator = get_translators()
+
         # Perform translation
         if direction == "English → Siswati":
             if en_ss_translator is None:
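Because the pipelines are now fetched lazily inside the request handler, the first translation after a cold start pays the full model-load cost. If that matters, the cache can be warmed before serving; a sketch, assuming `app.py` ends with the usual Gradio launch block (not shown in this diff):

```python
# Hypothetical warm-up at startup: populate _model_cache before serving
# requests, so the first user doesn't wait for the checkpoints to load.
if __name__ == "__main__":
    get_translators()  # may take a while on the first run
    demo = create_gradio_interface()
    demo.launch()
```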
@@ -132,7 +161,6 @@ def create_analysis_report(source_metrics, target_metrics, siswati_features, pro
     ### Translation Details
     - **Direction**: {direction}
     - **Processing Time**: {processing_time:.2f} seconds
-    - **Timestamp**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

     ### Text Complexity Metrics
     | Metric | Source | Target | Ratio |
@@ -178,42 +206,117 @@ def create_metrics_table(source_metrics, target_metrics, processing_time):
     return pd.DataFrame(data)

-def …
-    """…
+def secure_file_processing(file_obj, direction):
+    """Securely process uploaded files with proper cleanup"""
     if file_obj is None:
         return "Please upload a file.", ""

+    # Create a unique temporary directory for this processing session
+    session_id = str(uuid.uuid4())
+    temp_dir = None
+
     try:
-        # …
-        texts = …
+        # Create secure temporary directory
+        temp_dir = tempfile.mkdtemp(prefix=f"translation_{session_id}_")
+
+        # Get file extension and validate
+        file_ext = os.path.splitext(file_obj.name)[1].lower()
+        if file_ext not in ['.txt', '.csv']:
+            return "Only .txt and .csv files are supported.", ""
+
+        # Create secure temporary file path
+        temp_file_path = os.path.join(temp_dir, f"upload_{session_id}{file_ext}")
+
+        # Copy uploaded file to secure location
+        shutil.copy2(file_obj.name, temp_file_path)

+        # Process file based on type
+        texts = []
+        if file_ext == '.csv':
+            try:
+                df = pd.read_csv(temp_file_path)
+                if df.empty:
+                    return "The uploaded CSV file is empty.", ""
+                # Assume first column contains text to translate
+                texts = df.iloc[:, 0].dropna().astype(str).tolist()
+            except Exception as e:
+                return f"Error reading CSV file: {str(e)}", ""
+        else:  # .txt file
+            try:
+                with open(temp_file_path, 'r', encoding='utf-8') as f:
+                    content = f.read()
+                texts = [line.strip() for line in content.split('\n') if line.strip()]
+            except Exception as e:
+                return f"Error reading text file: {str(e)}", ""
+
+        if not texts:
+            return "No text found in the uploaded file.", ""
+
+        # Limit batch size for performance and security
+        max_batch_size = 10
+        if len(texts) > max_batch_size:
+            texts = texts[:max_batch_size]
+            warning_msg = f"Processing limited to first {max_batch_size} entries for security and performance reasons."
+        else:
+            warning_msg = ""
+
+        # Get translators once for the whole batch (cached after the first call)
+        en_ss_translator, ss_en_translator = get_translators()
+
+        # Process translations
         results = []
         for i, text in enumerate(texts):
+            if len(text.strip()) == 0:
+                continue
+
+            # Limit individual text length for security
+            if len(text) > 1000:
+                text = text[:1000] + "..."
+
+            # Perform translation based on direction
+            try:
+                if direction == "English → Siswati":
+                    if en_ss_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = en_ss_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+                else:  # Siswati → English
+                    if ss_en_translator is None:
+                        translated = "Model not available"
+                    else:
+                        result = ss_en_translator(text, max_length=512)
+                        translated = result[0]['translation_text']
+            except Exception as e:
+                translated = f"Translation error: {str(e)}"
+
             results.append({
+                'Index': i + 1,
                 'Original': text[:100] + '...' if len(text) > 100 else text,
-                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
-                'Index': i + 1
+                'Translation': translated[:100] + '...' if len(translated) > 100 else translated
             })

+        if not results:
+            return "No valid text entries found to translate.", ""
+
         results_df = pd.DataFrame(results)
-        summary = f"…
+        summary = f"Successfully processed {len(results)} text entries."
+        if warning_msg:
+            summary = f"{summary} {warning_msg}"

         return summary, results_df

     except Exception as e:
         return f"Error processing file: {str(e)}", ""
+
+    finally:
+        # Clean up temporary files and directory
+        if temp_dir and os.path.exists(temp_dir):
+            try:
+                shutil.rmtree(temp_dir)
+            except Exception as e:
+                print(f"Warning: Could not clean up temporary directory: {e}")

 # Define example texts
 TRANSLATION_EXAMPLES = [
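The `mkdtemp`/`finally` pairing above works, which is also why `shutil` belongs with the module-level imports rather than inside `try`: the cleanup path must be able to reach it. An alternative that gets the same isolation with guaranteed cleanup is `tempfile.TemporaryDirectory`, which removes the whole tree when the `with` block exits, even on early returns or exceptions. A minimal sketch of the staging-and-read step under that API (the function name is illustrative, not from the commit):

```python
import os
import shutil
import tempfile
import uuid

def read_upload_isolated(upload_path):
    """Stage an uploaded .txt file in an isolated temp dir and read its lines.

    TemporaryDirectory deletes the whole tree when the with-block exits,
    even if an exception is raised, so no explicit finally/rmtree is needed.
    """
    session_id = str(uuid.uuid4())
    with tempfile.TemporaryDirectory(prefix=f"translation_{session_id}_") as temp_dir:
        staged = os.path.join(temp_dir, f"upload_{session_id}.txt")
        shutil.copy2(upload_path, staged)
        with open(staged, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
```

The rest of the function (validation, batching, translation) would sit inside the `with` block unchanged.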
@@ -228,7 +331,7 @@ TRANSLATION_EXAMPLES = [
 ]

 def create_gradio_interface():
-    """Create the main Gradio interface"""
+    """Create the main Gradio interface with security measures"""

     with gr.Blocks(
         title="Siswati-English Linguistic Translation Tool",
@@ -274,7 +377,8 @@ def create_gradio_interface():
                 input_text = gr.Textbox(
                     label="Input Text",
                     placeholder="Enter text to translate...",
-                    lines=4
+                    lines=4,
+                    max_lines=10
                 )

                 translate_btn = gr.Button("Translate & Analyze", variant="primary", size="lg")
@@ -323,8 +427,14 @@ def create_gradio_interface():
         # Batch Processing Tab
         with gr.Tab("Batch Processing"):
             gr.Markdown("""
-            ### Corpus Analysis & Batch Translation
-            Upload text files or CSV files for batch translation and corpus analysis.
+            ### Secure Corpus Analysis & Batch Translation
+            Upload text files or CSV files for batch translation and corpus analysis. Files are processed securely and temporarily.
+
+            **Security Features:**
+            - Files are processed in isolated temporary directories
+            - No file persistence or history
+            - Automatic cleanup after processing
+            - Limited to first 10 entries for performance
             """)

             with gr.Row():
@@ -336,9 +446,10 @@ def create_gradio_interface():
                     )

                     file_upload = gr.File(
-                        label="Upload File",
+                        label="Upload File (Max 5MB)",
                         file_types=[".txt", ".csv"],
-                        type="filepath"
+                        type="filepath",
+                        file_count="single"
                     )

                     batch_btn = gr.Button("Process Batch", variant="primary")
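The new label promises a 5 MB cap, but `gr.File` does not enforce one here and `secure_file_processing` never checks the size (depending on the Gradio version, a global `max_file_size` can be passed to `launch()`, but nothing in this diff sets it). A small guard in the handler keeps the label honest; the constant and helper below are illustrative:

```python
import os

MAX_UPLOAD_BYTES = 5 * 1024 * 1024  # mirrors the "Max 5MB" label

def upload_too_large(path):
    """Return True if the uploaded file exceeds the advertised limit."""
    return os.path.getsize(path) > MAX_UPLOAD_BYTES
```

Calling it right after the extension check would reject oversized uploads before any copying or parsing.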
@@ -347,7 +458,8 @@ def create_gradio_interface():
                     **Supported formats:**
                     - `.txt` files: One text per line
                     - `.csv` files: Text in first column
-                    - **…
+                    - **Security limits**: Max 10 entries, 1000 chars per text
+                    - **Privacy**: Files are automatically deleted after processing
                     """)

                 with gr.Column():
@@ -364,7 +476,7 @@ def create_gradio_interface():
                     )

                 batch_btn.click(
-                    fn=…
+                    fn=secure_file_processing,
                     inputs=[file_upload, batch_direction],
                     outputs=[batch_summary, batch_results]
                 )
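One wiring detail worth flagging: every error path in `secure_file_processing` returns `""` for its second output, which is bound to `batch_results` (presumably a `gr.Dataframe`, given the success path returns a pandas DataFrame). Some Gradio versions render a bare string there poorly; normalizing error returns to an empty DataFrame is a safer sentinel. A hedged wrapper sketch (names are illustrative):

```python
import pandas as pd

EMPTY_RESULTS = pd.DataFrame(columns=['Index', 'Original', 'Translation'])

def batch_handler(file_obj, direction):
    """Normalize error returns so the Dataframe output always gets a DataFrame."""
    summary, results = secure_file_processing(file_obj, direction)
    if not isinstance(results, pd.DataFrame):
        results = EMPTY_RESULTS
    return summary, results
```

`batch_btn.click` would then point `fn` at the wrapper instead.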
@@ -373,7 +485,7 @@ def create_gradio_interface():
         with gr.Tab("Research Tools"):
             gr.Markdown("""
             ### Advanced Linguistic Analysis Tools
-            Explore detailed linguistic features
+            Explore detailed linguistic features without data persistence.
             """)

             with gr.Row():
@@ -381,7 +493,8 @@ def create_gradio_interface():
                     research_text = gr.Textbox(
                         label="Text for Analysis",
                         lines=6,
-                        placeholder="Enter Siswati or English text for detailed analysis..."
+                        placeholder="Enter Siswati or English text for detailed analysis...",
+                        max_lines=15
                     )

                     analyze_btn = gr.Button("Analyze Text", variant="primary")
@@ -392,18 +505,23 @@ def create_gradio_interface():
                     )

             def detailed_analysis(text):
-                """Perform detailed linguistic analysis"""
+                """Perform detailed linguistic analysis without storing data"""
                 if not text.strip():
                     return {}

+                # Limit text length for security
+                if len(text) > 2000:
+                    text = text[:2000] + "..."
+
                 metrics = calculate_linguistic_metrics(text)
                 siswati_features = analyze_siswati_features(text)

+                # Return analysis without sensitive information
                 return {
                     "basic_metrics": metrics,
                     "siswati_features": siswati_features,
-                    "…
-                    "…
+                    "text_length": len(text),
+                    "analysis_completed": True
                 }

             analyze_btn.click(
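A small subtlety in `detailed_analysis`: the 2000-character cap is applied before the metrics are computed, so `"text_length"` reports the truncated length (plus the appended ellipsis), not the submitted one. If the submitted length is the more useful figure, capture it before truncating; a minimal variant:

```python
def detailed_analysis(text):
    """Variant sketch: record the submitted length before truncating."""
    if not text.strip():
        return {}

    original_length = len(text)  # captured before the security cap
    if len(text) > 2000:
        text = text[:2000] + "..."

    return {
        "basic_metrics": calculate_linguistic_metrics(text),
        "siswati_features": analyze_siswati_features(text),
        "text_length": original_length,  # submitted length, not truncated
        "analysis_completed": True,
    }
```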
@@ -440,6 +558,12 @@ def create_gradio_interface():

     **Training Data**: Models trained on the Vuk'uzenzele and ZA-gov-multilingual South African corpora.

+    ### Privacy & Security
+    - No conversation history is stored
+    - Uploaded files are automatically deleted after processing
+    - All processing happens in isolated temporary environments
+    - No user data persistence
+
     ### Acknowledgments
     We thank **Thapelo Sindanie** and **Unarine Netshifhefhe** for their contributions to this work.