David Pomerenke committed
Commit 4f572a5 · 1 Parent(s): e92634d

Metrics selector & refactoring

Files changed (3)
  1. app.py +253 -106
  2. evals.py +28 -26
  3. results.json +210 -210
app.py CHANGED
@@ -2,22 +2,74 @@ import json
 
 import gradio as gr
 import pandas as pd
-import plotly.graph_objects as go
 import plotly.express as px
+import plotly.graph_objects as go
 import pycountry
 
 with open("results.json") as f:
     results = json.load(f)
 
+# Global constants for metric mappings
+METRICS = {
+    "overall_performance": {
+        "display_name": "Overall Performance",
+        "field_name": "overall_score",
+        "label": "Overall Performance Score",
+        "explanation": """
+        **Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        Higher scores indicate better overall language capabilities.
+        """,
+    },
+    "translation_bleu": {
+        "display_name": "Translation (BLEU)",
+        "field_name": "mt_bleu",
+        "label": "BLEU Score",
+        "explanation": """
+        **Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
+        It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
+        """,
+    },
+    "translation_chrf": {
+        "display_name": "Translation (ChrF)",
+        "field_name": "mt_chrf",
+        "label": "ChrF Score",
+        "explanation": """
+        **Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
+        This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
+        Higher scores (0-1) indicate better translations.
+        """,
+    },
+    "classification_accuracy": {
+        "display_name": "Classification (Accuracy)",
+        "field_name": "cls_acc",
+        "label": "Classification Accuracy",
+        "explanation": """
+        **Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
+        This evaluates a model's understanding of content and context across different languages.
+        Reported as a percentage where higher values indicate better classification performance.
+        """,
+    },
+    "mlm_chrf": {
+        "display_name": "Masked Language Modeling (ChrF)",
+        "field_name": "mlm_chrf",
+        "label": "MLM ChrF Score",
+        "explanation": """
+        **Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
+        This tests a model's understanding of language structure and semantics by measuring the character-level similarity
+        between predicted and actual text. Higher scores indicate better language understanding.
+        """,
+    },
+}
+
 
 def mean(lst):
     return sum(lst) / len(lst)
 
 
-def create_leaderboard_df(results):
+def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
-    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
-    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
+    langs_with_score = [lang for lang in results if lang[metric['field_name']] is not None]
+    sorted_langs = sorted(langs_with_score, key=lambda x: x[metric['field_name']], reverse=True)
     n_langs = len(sorted_langs)
     high_cutoff = n_langs // 4  # top 25%
     low_cutoff = n_langs - n_langs // 4  # bottom 25%
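Reviewer note on the new `METRICS` registry: each entry carries a `display_name` for the dropdown, a `field_name` that keys directly into the rows of results.json, and a `label`/`explanation` for the UI, so the plotting functions below can stay metric-agnostic. A minimal sketch of the lookup (the toy result row is made up):

```python
# Minimal sketch of the registry pattern; the `row` values here are hypothetical.
METRICS = {
    "translation_chrf": {
        "display_name": "Translation (ChrF)",
        "field_name": "mt_chrf",
        "label": "ChrF Score",
    },
}

row = {"language_name": "German", "mt_chrf": 57.66, "mt_bleu": 0.39}  # toy row

metric = METRICS["translation_chrf"]
value = row[metric["field_name"]]  # generic access instead of hard-coded row["mt_chrf"]
print(f"{metric['label']}: {value}")  # -> ChrF Score: 57.66
```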
@@ -45,7 +97,7 @@ def create_leaderboard_df(results):
                 "Mid-Resource": [],
                 "Low-Resource": [],
             }
-        model_scores[model][category].append(score["bleu"])
+        model_scores[model][category].append(score[metric['field_name']])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
@@ -79,17 +131,17 @@ def create_leaderboard_df(results):
         leaderboard_data.append(
             {
                 "Model": f"[{model_name}](https://openrouter.ai/{model})",
-                "Overall BLEU": overall_avg,
-                "High-Resource BLEU": high_avg,
-                "Mid-Resource BLEU": mid_avg,
-                "Low-Resource BLEU": low_avg,
+                "Overall Score": overall_avg,
+                "High-Resource Score": high_avg,
+                "Mid-Resource Score": mid_avg,
+                "Low-Resource Score": low_avg,
                 "Languages Tested": len(all_scores),
             }
         )
 
     # Sort by overall BLEU
     df = pd.DataFrame(leaderboard_data)
-    df = df.sort_values("Overall BLEU", ascending=False)
+    df = df.sort_values("Overall Score", ascending=False)
 
     # Add rank and medals
     df["Rank"] = range(1, len(df) + 1)
@@ -102,10 +154,10 @@ def create_leaderboard_df(results):
         [
             "Rank",
             "Model",
-            "Overall BLEU",
-            "High-Resource BLEU",
-            "Mid-Resource BLEU",
-            "Low-Resource BLEU",
+            "Overall Score",
+            "High-Resource Score",
+            "Mid-Resource Score",
+            "Low-Resource Score",
             "Languages Tested",
         ]
     ]
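The resource-category cutoffs are plain integer arithmetic over the ranked language list. The assignment logic between these hunks is not part of the diff, so the branching below is an assumption about how the cutoffs are applied:

```python
# Worked example of the quartile cutoffs (assumed category assignment).
n_langs = 18
high_cutoff = n_langs // 4           # 4 -> ranks 0..3 count as High-Resource
low_cutoff = n_langs - n_langs // 4  # 14 -> ranks 14..17 count as Low-Resource

categories = []
for rank in range(n_langs):
    if rank < high_cutoff:
        categories.append("High-Resource")
    elif rank >= low_cutoff:
        categories.append("Low-Resource")
    else:
        categories.append("Mid-Resource")

assert categories.count("High-Resource") == 4
assert categories.count("Low-Resource") == 4
assert categories.count("Mid-Resource") == 10
```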
@@ -126,19 +178,34 @@ def create_leaderboard_df(results):
     )
 
 
-def create_model_comparison_plot(results):
+def create_model_comparison_plot(metric):
     top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]
-    scores_flat = [
-        {"language": lang["language_name"], "model": score["model"], "bleu": score["bleu"]}
-        for lang in top_languages
-        for score in lang["scores"]
-    ]
+
+    # Create appropriate title and y-axis label based on metric
+    title = f"{metric['display_name']} by Model and Language"
+    y_label = metric['label']
+
+    # Flatten the data for the selected metric
+    scores_flat = []
+    for lang in top_languages:
+        for score in lang["scores"]:
+            # Get the value directly using the field name
+            value = score[metric['field_name']]
+            if value is not None:
+                scores_flat.append(
+                    {
+                        "language": lang["language_name"],
+                        "model": score["model"],
+                        "value": value,
+                    }
+                )
+
     df = pd.DataFrame(scores_flat)
-    fig = px.bar(df, x="language", y="bleu", color="model", barmode="group")
+    fig = px.bar(df, x="language", y="value", color="model", barmode="group")
     fig.update_layout(
-        title="BLEU Scores by Model and Language",
+        title=title,
         xaxis_title=None,
-        yaxis_title="BLEU Score",
+        yaxis_title=y_label,
         barmode="group",
         height=500,
         legend=dict(
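`px.bar` with `color=` expects long-format data, one record per (language, model) pair, which is exactly what the rewritten loop produces; it also drops `None` scores instead of passing them to the plot. A self-contained miniature with toy scores:

```python
# Long-format flattening as consumed by px.bar (toy data).
import pandas as pd
import plotly.express as px

top_languages = [
    {"language_name": "English", "scores": [{"model": "m1", "mt_chrf": 65.9},
                                            {"model": "m2", "mt_chrf": None}]},
    {"language_name": "Hindi", "scores": [{"model": "m1", "mt_chrf": 53.5}]},
]
field = "mt_chrf"  # would come from metric['field_name']

rows = [
    {"language": lang["language_name"], "model": s["model"], "value": s[field]}
    for lang in top_languages
    for s in lang["scores"]
    if s[field] is not None  # None would otherwise produce broken bars
]
df = pd.DataFrame(rows)  # 2 rows: English/m1 and Hindi/m1
fig = px.bar(df, x="language", y="value", color="model", barmode="group")
```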
@@ -152,17 +219,18 @@ def create_model_comparison_plot(results):
     return fig
 
 
-def create_language_stats_df(results):
+def create_language_stats_df(metric):
     # Create a list to store flattened data
     flat_data = []
 
     for lang in results:
         # Find the best model and its BLEU score
-        best_score = max(
-            lang["scores"] or [{"overall_score": None, "model": None}], key=lambda x: x["overall_score"]
+        best_model = max(
+            lang["scores"] or [{"overall_score": None, "model": None}],
+            key=lambda x: x["overall_score"],
         )
 
-        model = best_score["model"]
+        model = best_model["model"]
         model_name = model.split("/")[-1] if model else "N/A"
         model_link = (
             f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
@@ -181,14 +249,14 @@ def create_language_stats_df(results):
                 "Overall": round(lang["overall_score"], 3)
                 if lang["overall_score"] is not None
                 else "N/A",
-                "Trans-lation": round(lang["bleu"], 3)
-                if lang["bleu"] is not None
+                "Trans-lation": round(lang["mt_bleu"], 3)
+                if lang["mt_bleu"] is not None
                 else "N/A",
-                "Classi-fication": round(lang["accuracy"], 3)
-                if lang["accuracy"] is not None
+                "Classi-fication": round(lang["cls_acc"], 3)
+                if lang["cls_acc"] is not None
                 else "N/A",
-                "MLM": round(lang["mlm"], 3)
-                if lang["mlm"] is not None
+                "MLM": round(lang["mlm_chrf"], 3)
+                if lang["mlm_chrf"] is not None
                 else "N/A",
                 "Best Model": model_link,
                 "CommonVoice Hours": commonvoice_link,
@@ -201,27 +269,54 @@ def create_language_stats_df(results):
         label="Language Results",
         show_search="search",
         datatype=[
-            "markdown",  # Language
-            "number",  # Speakers
+            "markdown",  # Language
+            "number",  # Speakers
             # "number",  # Models Tested
-            "number",  # Overall
-            "number",  # Translation
-            "number",  # Classification
-            "number",  # MLM
-            "markdown",  # Best Model
-            "markdown",  # CommonVoice Hours
+            "number",  # Overall
+            "number",  # Translation
+            "number",  # Classification
+            "number",  # MLM
+            "markdown",  # Best Model
+            "markdown",  # CommonVoice Hours
         ],
     )
 
 
-def create_scatter_plot(results):
+def create_scatter_plot(metric):
+    # Filter results to include only languages with sufficient speakers
+    filtered_results = [lang for lang in results if lang["speakers"] >= 10_000]
+
+    # Create a list to store data for the scatter plot
+    scatter_data = []
+
+    for lang in filtered_results:
+        # Calculate average score for this metric across all models
+        scores = [
+            score[metric['field_name']]
+            for score in lang["scores"]
+            if score[metric['field_name']] is not None
+        ]
+        if scores:  # Only include if we have valid scores
+            avg_score = sum(scores) / len(scores)
+            scatter_data.append(
+                {
+                    "language": lang["language_name"],
+                    "speakers": lang["speakers"],
+                    "score": avg_score,
+                }
+            )
+
     fig = go.Figure()
 
+    # Convert speakers to millions for display
     x_vals = [
-        lang["speakers"] / 1_000_000 for lang in results if lang["speakers"] >= 10_000
+        data["speakers"] / 1_000_000 for data in scatter_data
     ]  # Convert to millions
-    y_vals = [lang["bleu"] for lang in results]
-    labels = [lang["language_name"] for lang in results]
+    y_vals = [data["score"] for data in scatter_data]
+    labels = [data["language"] for data in scatter_data]
+
+    # Create hover template
+    hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
 
     fig.add_trace(
         go.Scatter(
@@ -230,16 +325,14 @@ def create_scatter_plot(results):
             mode="markers+text",
             text=labels,
             textposition="top center",
-            hovertemplate="<b>%{text}</b><br>"
-            + "Speakers: %{x:.1f}M<br>"
-            + "BLEU Score: %{y:.3f}<extra></extra>",
+            hovertemplate=hover_template,
         )
     )
 
     fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
-        yaxis_title="Average BLEU Score",
+        yaxis_title=metric['label'],
        height=500,
        showlegend=False,
    )
@@ -247,7 +340,7 @@ def create_scatter_plot(results):
     # Use log scale for x-axis since speaker numbers vary widely
     fig.update_xaxes(type="log")
 
-    return gr.Plot(value=fig, label="Speaker population vs BLEU")
+    return fig
 
 
 def format_number(n):
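The scatter rewrite also fixes a latent alignment bug: the old code filtered `x_vals` by speaker count but left `y_vals` and `labels` unfiltered, so the three lists could diverge in length. Building one record per language first keeps them aligned by construction:

```python
# Why scatter_data-first matters: one record per language keeps x, y, labels aligned.
scatter_data = [
    {"language": "German", "speakers": 136_000_000, "score": 57.7},  # toy values
    {"language": "Javanese", "speakers": 68_000_000, "score": 48.9},
]
x_vals = [d["speakers"] / 1_000_000 for d in scatter_data]
y_vals = [d["score"] for d in scatter_data]
labels = [d["language"] for d in scatter_data]
assert len(x_vals) == len(y_vals) == len(labels)
```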
@@ -258,8 +351,10 @@ def format_number(n):
         return f"{n/1_000:.0f}K"
     return str(n)
 
+
 def get_population_data():
     import xml.etree.ElementTree as ET
+
     from language_data.util import data_filename
 
     filename = data_filename("supplementalData.xml")
@@ -268,17 +363,46 @@ def get_population_data():
 
     data = {}
     for territory in territories:
-        t_code = territory.attrib['type']
-        t_population = float(territory.attrib['population'])
+        t_code = territory.attrib["type"]
+        t_population = float(territory.attrib["population"])
         data[t_code] = t_population
     return data
 
-def create_world_map(results):
+# Helper functions for visualization
+def make_black_bar(value, max_width=10):
+    filled = int(value * max_width)
+    return "⬛️" * filled + "⬜️" * (max_width - filled)
+
+
+def make_colored_bar(score, max_width=10):
+    """Create a colored bar using Unicode blocks based on normalized score
+    🟦 for high values (>0.35)
+    🟨 for medium values (0.25-0.35)
+    🟥 for low values (<0.25)
+    ⬜ for empty space
+
+    This function handles both normalization and bar creation.
+    """
+
+    # Create the bar based on normalized value
+    filled = int(score * max_width)
+    filled = max(0, min(filled, max_width))
+    empty = max_width - filled
+
+    if score > 0.35:
+        return "🟦" * filled + "⬜" * empty
+    elif score > 0.25:
+        return "🟨" * filled + "⬜" * empty
+    else:
+        return "🟥" * filled + "⬜" * empty
+
+def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
     for lang in results:
-        if "population" not in lang or lang["bleu"] is None:
+        # Skip languages without the required data
+        if "population" not in lang or lang[metric['field_name']] is None:
             continue
 
         for country_code, speakers in lang["population"].items():
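For reference, this is what the emoji bars render to at the thresholds in `make_colored_bar` (condensed re-implementation, same logic). Note that the function expects a roughly 0-1 score; a raw 0-100 ChrF value would clamp to a fully filled blue bar, so the "normalized score" in the docstring matters:

```python
# Condensed re-implementation of make_colored_bar, for illustration only.
def make_colored_bar(score, max_width=10):
    filled = max(0, min(int(score * max_width), max_width))
    block = "🟦" if score > 0.35 else "🟨" if score > 0.25 else "🟥"
    return block * filled + "⬜" * (max_width - filled)

for s in (0.1, 0.3, 0.6):
    print(f"{s:.1f} {make_colored_bar(s)}")
# 0.1 🟥⬜⬜⬜⬜⬜⬜⬜⬜⬜
# 0.3 🟨🟨🟨⬜⬜⬜⬜⬜⬜⬜
# 0.6 🟦🟦🟦🟦🟦🟦⬜⬜⬜⬜
```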
@@ -293,17 +417,19 @@ def create_world_map(results):
                 country_data[iso3_code] = {
                     "total_speakers": 0,
                     "population": population_data.get(country_code, 0),
-                    "weighted_bleu_sum": 0,
+                    "weighted_score_sum": 0,
                     "languages": [],
                 }
 
             country_data[iso3_code]["total_speakers"] += speakers
-            country_data[iso3_code]["weighted_bleu_sum"] += speakers * lang["bleu"]
+            country_data[iso3_code]["weighted_score_sum"] += (
+                speakers * lang[metric['field_name']]
+            )
             country_data[iso3_code]["languages"].append(
                 {
                     "name": lang["language_name"],
                     "speakers": speakers,
-                    "bleu": lang["bleu"],
+                    "score": lang[metric['field_name']],
                 }
             )
         except (KeyError, AttributeError):
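Each country's map color is the speaker-weighted mean of its languages' scores, accumulated incrementally above. The same computation in one place, with toy numbers:

```python
# Speaker-weighted average as accumulated in country_data (toy numbers).
langs = [
    {"name": "Hindi", "speakers": 528_000_000, "score": 0.66},
    {"name": "English", "speakers": 238_000_000, "score": 0.75},
]
total_speakers = sum(l["speakers"] for l in langs)
weighted_score_sum = sum(l["speakers"] * l["score"] for l in langs)
weighted_avg = weighted_score_sum / total_speakers
print(round(weighted_avg, 3))  # 0.688, pulled toward the larger speaker base
```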
@@ -312,33 +438,11 @@ def create_world_map(results):
 
     # Calculate final weighted averages and prepare hover text
     countries = []
-    bleu_scores = []
+    scores = []
     hover_texts = []
 
-    def make_black_bar(value, max_width=10):
-        filled = int(value * max_width)
-        return "⬛️" * filled + "⬜️" * (max_width - filled)
-
-    def make_colored_bar(value, max_width=10):
-        """Create a colored bar using Unicode blocks
-        🟦 for high values (>0.35)
-        🟨 for medium values (0.25-0.35)
-        🟥 for low values (<0.25)
-        ⬜ for empty space
-        """
-        filled = int(value * max_width)
-        filled = max(0, min(filled, max_width))
-        empty = max_width - filled
-
-        if value > 0.35:
-            return "🟦" * filled + "⬜" * empty
-        elif value > 0.25:
-            return "🟨" * filled + "⬜" * empty
-        else:
-            return "🟥" * filled + "⬜" * empty
-
     for country_code, data in country_data.items():
-        weighted_avg = data["weighted_bleu_sum"] / data["total_speakers"]
+        weighted_avg = data["weighted_score_sum"] / data["total_speakers"]
 
         try:
             country_name = pycountry.countries.get(alpha_3=country_code).name
@@ -357,38 +461,39 @@ def create_world_map(results):
         for lang in main_langs:
             percentage = (lang["speakers"] / data["population"]) * 100
             speaker_bar = make_black_bar(percentage / 100)
-            bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
+
+            # Use the integrated make_colored_bar function directly
+            score_bar = make_colored_bar(lang["score"])
 
             lang_rows.append(
                 f"<b>{lang['name']}</b><br>"
                 f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
-                f"{bleu_bar} {lang['bleu']:.3f} BLEU<br>"
+                f"{score_bar} {lang['score']:.3f} {metric['label']}<br>"
             )
 
         # Add summary for other languages if any
         if other_langs:
             other_speakers = sum(lang["speakers"] for lang in other_langs)
             other_percentage = (other_speakers / data["population"]) * 100
-            other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
+            other_avg_score = sum(lang["score"] for lang in other_langs) / len(
                 other_langs
             )
 
             speaker_bar = make_black_bar(other_percentage / 100)
-            bleu_bar = make_colored_bar((other_avg_bleu - 0.2) / 0.2)
+
+            # Use the integrated make_colored_bar function directly
+            score_bar = make_colored_bar(other_avg_score)
 
             lang_rows.append(
                 f"<b>+{len(other_langs)} other languages</b><br>"
                 f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
-                f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
+                f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>"
             )
 
-        hover_text = (
-            f"<b>{country_name}</b><br><br>"
-            f"{'<br>'.join(lang_rows)}"
-        )
+        hover_text = f"<b>{country_name}</b><br><br>" f"{'<br>'.join(lang_rows)}"
 
         countries.append(country_code)
-        bleu_scores.append(weighted_avg)
+        scores.append(weighted_avg)
         hover_texts.append(hover_text)
 
     # Create the choropleth map
@@ -396,12 +501,12 @@ def create_world_map(results):
         data=go.Choropleth(
             locations=countries,
             locationmode="ISO-3",
-            z=bleu_scores,
+            z=scores,
             text=hover_texts,
             hoverinfo="text",
             colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
             colorbar=dict(
-                title="BLEU Score",
+                title=metric['label'],
                 orientation="h",  # horizontal orientation
                 y=-0.2,  # position below map
                 yanchor="bottom",
@@ -410,13 +515,11 @@ def create_world_map(results):
                 xanchor="center",
                 thickness=20,  # make it a bit thicker when horizontal
             ),
-            zmin=0.1,
-            zmax=0.5,
         )
     )
 
     fig.update_layout(
-        title=dict(text="BLEU Score by Country", x=0.5, xanchor="center"),
+        title=dict(text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"),
         geo=dict(
             showframe=True,
             showcoastlines=True,
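A stripped-down version of the choropleth setup used here, with toy values, in case the structure is easier to read without the surrounding loop:

```python
import plotly.graph_objects as go

# Three-country toy map mirroring the go.Choropleth configuration above.
fig = go.Figure(
    go.Choropleth(
        locations=["DEU", "IND", "BRA"],  # ISO-3 codes, as produced by pycountry
        locationmode="ISO-3",
        z=[0.70, 0.66, 0.69],             # toy weighted averages
        text=["<b>Germany</b>", "<b>India</b>", "<b>Brazil</b>"],
        hoverinfo="text",                 # show only the custom HTML hover text
        colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
    )
)
fig.update_layout(geo=dict(showframe=True, showcoastlines=True))
```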
@@ -437,22 +540,48 @@
 
     return fig
 
+def create_metric_explanation(metric):
+    return gr.Markdown(metric['explanation'])
+
 
 # Create the visualization components
-with gr.Blocks(title="AI Language Translation Benchmark") as demo:
-    gr.Markdown("# AI Language Translation Benchmark")
+with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
+    gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown(
-        "Comparing translation performance across different AI models and languages"
+        "Comparing language proficiency across different models and languages."
+    )
+    start_metric = METRICS["overall_performance"]
+
+    metric = gr.Dropdown(
+        choices=[
+            metric_info["display_name"]
+            for metric_info in METRICS.values()
+        ],
+        value=start_metric["display_name"],
+        label="Select Metric",
+        interactive=True,
     )
+    metric_explanation = create_metric_explanation(start_metric)
 
-    bar_plot = create_model_comparison_plot(results)
-    world_map = create_world_map(results)
+    gr.Markdown("## Model Comparison")
+    create_leaderboard_df(start_metric)
+    model_comparison_plot = gr.Plot(
+        value=create_model_comparison_plot(start_metric),
+        label="Model Comparison",
+    )
 
-    create_leaderboard_df(results)
-    gr.Plot(value=bar_plot, label="Model Comparison")
-    create_language_stats_df(results)
-    create_scatter_plot(results)
-    gr.Plot(value=world_map, container=False, elem_classes="fullwidth-plot")
+    gr.Markdown("## Language Stats")
+    create_language_stats_df(start_metric)
+    scatter_plot = gr.Plot(
+        value=create_scatter_plot(start_metric),
+        label="Speaker Population vs. Metric",
+    )
+    world_map = gr.Plot(
+        value=create_world_map(start_metric),
+        label="World Map",
+        container=False,
+        elem_classes="fullwidth-plot",
+    )
 
     gr.Markdown(
         """
@@ -475,5 +604,23 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         """,
         container=True,
     )
+
+    def update_component(fn, metric_choice):
+        metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
+        return fn(metric)
+
+    from functools import partial
+
+    # Connect the dropdown to update all plots
+    metric.change(fn=partial(update_component, create_metric_explanation), inputs=metric, outputs=metric_explanation)
+    metric.change(
+        fn=partial(update_component, create_model_comparison_plot), inputs=metric, outputs=model_comparison_plot
+    )
+    metric.change(
+        fn=partial(update_component, create_scatter_plot), inputs=metric, outputs=scatter_plot
+    )
+    metric.change(
+        fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
+    )
 
 demo.launch()
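The wiring at the end of app.py (one `update_component` dispatcher plus `functools.partial` per output component) keeps each `metric.change` handler down to a single input. A stand-alone sketch of the same pattern, assuming a recent Gradio; the names here are illustrative, not from the repo:

```python
# Minimal dropdown-to-component wiring with functools.partial (illustrative app).
from functools import partial
import gradio as gr

METRICS = {
    "a": {"display_name": "Metric A", "explanation": "Explains A."},
    "b": {"display_name": "Metric B", "explanation": "Explains B."},
}

def update_component(fn, metric_choice):
    # map the selected display name back to the registry entry
    metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
    return fn(metric)

def render_explanation(metric):
    return gr.Markdown(metric["explanation"])

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(
        choices=[m["display_name"] for m in METRICS.values()],
        value="Metric A",
        interactive=True,
    )
    explanation = render_explanation(METRICS["a"])
    dropdown.change(
        fn=partial(update_component, render_explanation),
        inputs=dropdown,
        outputs=explanation,
    )
```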
evals.py CHANGED
@@ -209,8 +209,8 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     return {
         "model": model,
         "bcp_47": original_language["bcp_47"],
-        "bleu": bleu_score["bleu"],
-        "chrf": chrf_score["score"],
+        "mt_bleu": bleu_score["bleu"],
+        "mt_chrf": chrf_score["score"],
         "sentence_nr": sentence_nr,
     }
 
@@ -316,7 +316,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
     return {
         "model": model,
         "bcp_47": language["bcp_47"],
-        "chrf": chrf_score["score"],
+        "mlm_chrf": chrf_score["score"],
         "sentence_nr": nr,
     }
 
@@ -352,7 +352,7 @@ async def main():
     classification_scores = await tqdm_asyncio.gather(
         *classification_scores, miniters=1
     )
-    print("evaluate mlm")
+    print("evaluate masked language modeling")
     mlm_scores = [
         mlm_and_evaluate(model, language.bcp_47, i)
         for i in range(n_sentences)
@@ -362,9 +362,9 @@ async def main():
         and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
     ]
     mlm_scores = await tqdm_asyncio.gather(*mlm_scores, miniters=1)
-    results = []
+    all_results = []
     for language in languages.itertuples():
-        results_for_language = []
+        results = []
         for model in models:
             translations_for_model = [
                 score
@@ -381,36 +381,38 @@ async def main():
                 for score in mlm_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
-            bleu = mean([s["bleu"] for s in translations_for_model])
-            chrf = mean([s["chrf"] for s in translations_for_model])
-            accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
-            mlm = mean([s["chrf"] for s in mlm_for_model]) / 100
-            overall_score = (bleu + accuracy + mlm) / 3
+            mt_bleu = mean([s["mt_bleu"] for s in translations_for_model])
+            mt_chrf = mean([s["mt_chrf"] for s in translations_for_model])
+            cls_acc = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            mlm_chrf = mean([s["mlm_chrf"] for s in mlm_for_model])
+            overall_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
             if translations_for_model:
-                results_for_language.append(
+                results.append(
                     {
                         "model": model,
-                        "bleu": bleu,
-                        "chrf": chrf,
-                        "accuracy": accuracy,
-                        "mlm": mlm,
+                        "mt_bleu": mt_bleu,
+                        "mt_chrf": mt_chrf,
+                        "cls_acc": cls_acc,
+                        "mlm_chrf": mlm_chrf,
                         "overall_score": overall_score,
                     }
                 )
-        if results_for_language:
-            results.append(
+        if results:
+            all_results.append(
                 {
                     "language_name": language.language_name,
                     "bcp_47": language.bcp_47,
                     "speakers": language.speakers,
-                    "scores": results_for_language,
-                    "bleu": mean([s["bleu"] for s in results_for_language]),
-                    "chrf": mean([s["chrf"] for s in results_for_language]),
-                    "accuracy": mean([s["accuracy"] for s in results_for_language]),
-                    "mlm": mean([s["mlm"] for s in results_for_language]),
-                    "overall_score": mean(
-                        [s["overall_score"] for s in results_for_language]
+                    "scores": results,
+                    "mt_bleu": mean([s["mt_bleu"] for s in results]),
+                    "mt_chrf": mean([s["mt_chrf"] for s in results]),
+                    "cls_acc": mean(
+                        [s["cls_acc"] for s in results]
                     ),
+                    "mlm_chrf": mean(
+                        [s["mlm_chrf"] for s in results]
+                    ),
+                    "overall_score": mean([s["overall_score"] for s in results]),
                     "commonvoice_hours": language.commonvoice_hours
                     if not pd.isna(language.commonvoice_hours)
                     else None,
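The rescaling in the new `overall_score` is the point of this hunk: ChrF values live on a 0-100 scale while accuracy lives on 0-1, so both ChrF terms are divided by 100 before averaging (and the translation term is now ChrF-based rather than BLEU-based). Checking the formula against the llama-3.3 English row stored in results.json:

```python
# Worked check of the new overall_score against the stored English/llama row.
mt_chrf = 63.24229348441665   # 0-100 scale
cls_acc = 0.6                 # 0-1 scale
mlm_chrf = 93.62602669879945  # 0-100 scale

overall_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
print(overall_score)  # 0.7228944006107203, matching results.json
```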
@@ -421,7 +423,7 @@ async def main():
             }
         )
     with open("results.json", "w") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
+        json.dump(all_results, f, indent=2, ensure_ascii=False)
 
 
 if __name__ == "__main__":
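The dict keys read in `translate_and_evaluate` and `mlm_and_evaluate` above (`bleu_score["bleu"]`, `chrf_score["score"]`) match what the Hugging Face `evaluate` metrics return; whether evals.py actually uses that library is not visible in this diff, so treat the following as an assumption:

```python
# Assumed origin of the score dicts: Hugging Face `evaluate` (not shown in this diff).
import evaluate

bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")

predictions = ["the cat sat on the mat"]
references = [["the cat sat on the mat"]]

bleu_score = bleu.compute(predictions=predictions, references=references)
chrf_score = chrf.compute(predictions=predictions, references=references)
print(bleu_score["bleu"], chrf_score["score"])  # BLEU in 0-1, ChrF in 0-100
```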
results.json CHANGED
@@ -6,50 +6,50 @@
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.89404322120213,
-        "chrf": 92.53933977489264,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9778605197038973,
-        "overall_score": 0.8128568025242314
+        "mt_bleu": 0.5245466124037277,
+        "mt_chrf": 65.25187717981981,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 97.84704595784264,
+        "overall_score": 0.7325519660144305
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.4351349353198866,
-        "chrf": 54.9504915580248,
-        "accuracy": 0.6,
-        "mlm": 0.9681484728467826,
-        "overall_score": 0.6677611360555563
+        "mt_bleu": 0.48750797044187216,
+        "mt_chrf": 63.24229348441665,
+        "cls_acc": 0.6,
+        "mlm_chrf": 93.62602669879945,
+        "overall_score": 0.7228944006107203
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.8800468872938262,
-        "chrf": 94.30164664106223,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.804094099273989,
-        "overall_score": 0.7391581066337162
+        "mt_bleu": 0.486501959595472,
+        "mt_chrf": 63.8187259254881,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 79.91140615317198,
+        "overall_score": 0.656878218039978
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.8489646963773831,
-        "chrf": 92.73129066280984,
-        "accuracy": 0.8666666666666667,
-        "mlm": 0.9770616407001859,
-        "overall_score": 0.8975643345814119
+        "mt_bleu": 0.6060954569411976,
+        "mt_chrf": 71.2288943066563,
+        "cls_acc": 0.8666666666666667,
+        "mlm_chrf": 98.79868693366329,
+        "overall_score": 0.8556474930232877
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.8230104823079876,
-        "chrf": 91.69043412576788,
-        "accuracy": 0.7,
-        "mlm": 0.9632049588292643,
-        "overall_score": 0.8287384803790839
+        "mt_bleu": 0.5199836121545649,
+        "mt_chrf": 66.05410510011644,
+        "cls_acc": 0.7,
+        "mlm_chrf": 97.2290729316734,
+        "overall_score": 0.7776105934392995
       }
     ],
-    "bleu": 0.7762400445002428,
-    "chrf": 85.24264055251147,
-    "accuracy": 0.6533333333333333,
-    "mlm": 0.9380739382708239,
-    "overall_score": 0.7892157720348,
+    "mt_bleu": 0.5249271223073668,
+    "mt_chrf": 65.91917919929946,
+    "cls_acc": 0.6533333333333333,
+    "mlm_chrf": 93.48244773503015,
+    "overall_score": 0.7491165342255433,
     "commonvoice_hours": 2651.0,
     "commonvoice_locale": "en",
     "population": {
@@ -217,18 +217,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3977775857451761,
-        "chrf": 57.672913792439125,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.926731451729437,
-        "overall_score": 0.6303919013804266
+        "mt_bleu": 0.38557580495281013,
+        "mt_chrf": 61.11151378837755,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 94.55849047452216,
+        "overall_score": 0.7077889030985546
       }
     ],
-    "bleu": 0.3977775857451761,
-    "chrf": 57.672913792439125,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.926731451729437,
-    "overall_score": 0.6303919013804266,
+    "mt_bleu": 0.38557580495281013,
+    "mt_chrf": 61.11151378837755,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 94.55849047452216,
+    "overall_score": 0.7077889030985546,
     "commonvoice_hours": 422.0,
     "commonvoice_locale": "zh-TW",
     "population": {
@@ -261,18 +261,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.333521621016373,
-        "chrf": 50.48364584189306,
-        "accuracy": 0.5,
-        "mlm": 0.9585976421208252,
-        "overall_score": 0.5973730877123994
+        "mt_bleu": 0.32404902340686065,
+        "mt_chrf": 53.54085104449268,
+        "cls_acc": 0.5,
+        "mlm_chrf": 96.17240172798218,
+        "overall_score": 0.6657108425749162
       }
     ],
-    "bleu": 0.333521621016373,
-    "chrf": 50.48364584189306,
-    "accuracy": 0.5,
-    "mlm": 0.9585976421208252,
-    "overall_score": 0.5973730877123994,
+    "mt_bleu": 0.32404902340686065,
+    "mt_chrf": 53.54085104449268,
+    "cls_acc": 0.5,
+    "mlm_chrf": 96.17240172798218,
+    "overall_score": 0.6657108425749162,
     "commonvoice_hours": 16.0,
     "commonvoice_locale": "hi-IN",
     "population": {
@@ -291,18 +291,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.29160032861883095,
-        "chrf": 47.668399832701844,
-        "accuracy": 0.5,
-        "mlm": 0.9272973828072317,
-        "overall_score": 0.5729659038086875
+        "mt_bleu": 0.31587937116142056,
+        "mt_chrf": 52.142851262301726,
+        "cls_acc": 0.5,
+        "mlm_chrf": 96.92768852306384,
+        "overall_score": 0.6635684659512185
       }
     ],
-    "bleu": 0.29160032861883095,
-    "chrf": 47.668399832701844,
-    "accuracy": 0.5,
-    "mlm": 0.9272973828072317,
-    "overall_score": 0.5729659038086875,
+    "mt_bleu": 0.31587937116142056,
+    "mt_chrf": 52.142851262301726,
+    "cls_acc": 0.5,
+    "mlm_chrf": 96.92768852306384,
+    "overall_score": 0.6635684659512185,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es",
     "population": {
@@ -354,18 +354,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.277257629790728,
-        "chrf": 46.62779335380641,
-        "accuracy": 0.4666666666666667,
-        "mlm": 0.9617481078420298,
-        "overall_score": 0.5685574680998081
+        "mt_bleu": 0.39547934933771334,
+        "mt_chrf": 57.51652731936118,
+        "cls_acc": 0.4666666666666667,
+        "mlm_chrf": 94.97026443937914,
+        "overall_score": 0.6638448614180232
       }
     ],
-    "bleu": 0.277257629790728,
-    "chrf": 46.62779335380641,
-    "accuracy": 0.4666666666666667,
-    "mlm": 0.9617481078420298,
-    "overall_score": 0.5685574680998081,
+    "mt_bleu": 0.39547934933771334,
+    "mt_chrf": 57.51652731936118,
+    "cls_acc": 0.4666666666666667,
+    "mlm_chrf": 94.97026443937914,
+    "overall_score": 0.6638448614180232,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar",
     "population": {
@@ -416,18 +416,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2659144372728079,
-        "chrf": 44.14831240898717,
-        "accuracy": 0.43333333333333335,
-        "mlm": 0.9414677321132675,
-        "overall_score": 0.5469051675731363
+        "mt_bleu": 0.26585004461425726,
+        "mt_chrf": 47.37157150967947,
+        "cls_acc": 0.43333333333333335,
+        "mlm_chrf": 94.38802161979918,
+        "overall_score": 0.6169764215427066
       }
     ],
-    "bleu": 0.2659144372728079,
-    "chrf": 44.14831240898717,
-    "accuracy": 0.43333333333333335,
-    "mlm": 0.9414677321132675,
-    "overall_score": 0.5469051675731363,
+    "mt_bleu": 0.26585004461425726,
+    "mt_chrf": 47.37157150967947,
+    "cls_acc": 0.43333333333333335,
+    "mlm_chrf": 94.38802161979918,
+    "overall_score": 0.6169764215427066,
     "commonvoice_hours": 77.0,
     "commonvoice_locale": "ur",
     "population": {
@@ -445,18 +445,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.315663773358301,
-        "chrf": 49.253978669350964,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.960796739893282,
-        "overall_score": 0.6143757266394165
+        "mt_bleu": 0.3510210872150948,
+        "mt_chrf": 55.795595938804894,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 97.12318847922649,
+        "overall_score": 0.6986181702823268
       }
     ],
-    "bleu": 0.315663773358301,
-    "chrf": 49.253978669350964,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.960796739893282,
-    "overall_score": 0.6143757266394165,
+    "mt_bleu": 0.3510210872150948,
+    "mt_chrf": 55.795595938804894,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 97.12318847922649,
+    "overall_score": 0.6986181702823268,
     "commonvoice_hours": 1052.0,
     "commonvoice_locale": "fr",
     "population": {
@@ -531,18 +531,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.21265887286151353,
-        "chrf": 41.501657722373686,
-        "accuracy": 0.4,
-        "mlm": 0.8995272489886615,
-        "overall_score": 0.504062040616725
+        "mt_bleu": 0.2874920154082786,
+        "mt_chrf": 51.49279116112809,
+        "cls_acc": 0.4,
+        "mlm_chrf": 90.6067262108039,
+        "overall_score": 0.6069983912397733
       }
     ],
-    "bleu": 0.21265887286151353,
-    "chrf": 41.501657722373686,
-    "accuracy": 0.4,
-    "mlm": 0.8995272489886615,
-    "overall_score": 0.504062040616725,
+    "mt_bleu": 0.2874920154082786,
+    "mt_chrf": 51.49279116112809,
+    "cls_acc": 0.4,
+    "mlm_chrf": 90.6067262108039,
+    "overall_score": 0.6069983912397733,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn",
     "population": {
@@ -560,18 +560,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.27514792195783394,
-        "chrf": 45.901248962808694,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9640739007405215,
-        "overall_score": 0.6019628297883407
+        "mt_bleu": 0.33491649454450034,
+        "mt_chrf": 54.60211868234021,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.52676764996336,
+        "overall_score": 0.6926518433299008
       }
     ],
-    "bleu": 0.27514792195783394,
-    "chrf": 45.901248962808694,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9640739007405215,
-    "overall_score": 0.6019628297883407,
+    "mt_bleu": 0.33491649454450034,
+    "mt_chrf": 54.60211868234021,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.52676764996336,
+    "overall_score": 0.6926518433299008,
     "commonvoice_hours": 177.0,
     "commonvoice_locale": "pt",
     "population": {
@@ -600,18 +600,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3048037308116852,
-        "chrf": 48.4304965568793,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.9033444436966103,
-        "overall_score": 0.5804938359472096
+        "mt_bleu": 0.3078917767345886,
+        "mt_chrf": 50.505686987696365,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 90.10119297923285,
+        "overall_score": 0.6464673776675418
      }
    ],
-    "bleu": 0.3048037308116852,
-    "chrf": 48.4304965568793,
-    "accuracy": 0.5333333333333333,
-    "mlm": 0.9033444436966103,
-    "overall_score": 0.5804938359472096,
+    "mt_bleu": 0.3078917767345886,
+    "mt_chrf": 50.505686987696365,
+    "cls_acc": 0.5333333333333333,
+    "mlm_chrf": 90.10119297923285,
+    "overall_score": 0.6464673776675418,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN",
     "population": {
@@ -630,18 +630,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.26108507692625094,
-        "chrf": 45.063308940468154,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9563400339874765,
-        "overall_score": 0.5946972591934646
+        "mt_bleu": 0.32647288591882895,
+        "mt_chrf": 53.107657805277526,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.21400287169976,
+        "overall_score": 0.6866277578121466
       }
     ],
-    "bleu": 0.26108507692625094,
-    "chrf": 45.063308940468154,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9563400339874765,
-    "overall_score": 0.5946972591934646,
+    "mt_bleu": 0.32647288591882895,
+    "mt_chrf": 53.107657805277526,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.21400287169976,
+    "overall_score": 0.6866277578121466,
     "commonvoice_hours": 242.0,
     "commonvoice_locale": "ru",
     "population": {
@@ -677,18 +677,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2709203338132304,
-        "chrf": 44.36399636969686,
-        "accuracy": 0.5,
-        "mlm": 0.9612351448314987,
-        "overall_score": 0.5773851595482431
+        "mt_bleu": 0.29267168415814176,
+        "mt_chrf": 49.16720485265401,
+        "cls_acc": 0.5,
+        "mlm_chrf": 94.82776161604177,
+        "overall_score": 0.646649888228986
       }
     ],
-    "bleu": 0.2709203338132304,
-    "chrf": 44.36399636969686,
-    "accuracy": 0.5,
-    "mlm": 0.9612351448314987,
-    "overall_score": 0.5773851595482431,
+    "mt_bleu": 0.29267168415814176,
+    "mt_chrf": 49.16720485265401,
+    "cls_acc": 0.5,
+    "mlm_chrf": 94.82776161604177,
+    "overall_score": 0.646649888228986,
     "commonvoice_hours": 411.0,
     "commonvoice_locale": "sw",
     "population": {
@@ -710,18 +710,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.27441353638286026,
-        "chrf": 46.025445629112156,
-        "accuracy": 0.6,
-        "mlm": 0.9465444909745621,
-        "overall_score": 0.6069860091191407
+        "mt_bleu": 0.30782604302717903,
+        "mt_chrf": 52.62467814017025,
+        "cls_acc": 0.6,
+        "mlm_chrf": 95.83373661382923,
+        "overall_score": 0.6948613825133316
       }
     ],
-    "bleu": 0.27441353638286026,
-    "chrf": 46.025445629112156,
-    "accuracy": 0.6,
-    "mlm": 0.9465444909745621,
-    "overall_score": 0.6069860091191407,
+    "mt_bleu": 0.30782604302717903,
+    "mt_chrf": 52.62467814017025,
+    "cls_acc": 0.6,
+    "mlm_chrf": 95.83373661382923,
+    "overall_score": 0.6948613825133316,
     "commonvoice_hours": 33.0,
     "commonvoice_locale": "id",
     "population": {
@@ -736,18 +736,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3338682761061998,
-        "chrf": 50.216731068308064,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9526738506105953,
-        "overall_score": 0.6177362644611538
+        "mt_bleu": 0.3880450110946665,
+        "mt_chrf": 57.659717194572515,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.78268690494019,
+        "overall_score": 0.7036969025539311
       }
     ],
-    "bleu": 0.3338682761061998,
-    "chrf": 50.216731068308064,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9526738506105953,
-    "overall_score": 0.6177362644611538,
+    "mt_bleu": 0.3880450110946665,
+    "mt_chrf": 57.659717194572515,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.78268690494019,
+    "overall_score": 0.7036969025539311,
     "commonvoice_hours": 1358.0,
     "commonvoice_locale": "de",
     "population": {
@@ -787,18 +787,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2940100667664714,
-        "chrf": 46.403097021492236,
-        "accuracy": 0.6,
-        "mlm": 0.9337910001211718,
-        "overall_score": 0.609267022295881
+        "mt_bleu": 0.2478415746367755,
+        "mt_chrf": 47.19744231900874,
+        "cls_acc": 0.6,
+        "mlm_chrf": 92.47052714876749,
+        "overall_score": 0.6655598982259208
       }
     ],
-    "bleu": 0.2940100667664714,
-    "chrf": 46.403097021492236,
-    "accuracy": 0.6,
-    "mlm": 0.9337910001211718,
-    "overall_score": 0.609267022295881,
+    "mt_bleu": 0.2478415746367755,
+    "mt_chrf": 47.19744231900874,
+    "cls_acc": 0.6,
+    "mlm_chrf": 92.47052714876749,
+    "overall_score": 0.6655598982259208,
     "commonvoice_hours": 222.0,
     "commonvoice_locale": "ja",
     "population": {
@@ -814,18 +814,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2750887189010237,
-        "chrf": 46.31463752811596,
-        "accuracy": 0.4,
-        "mlm": 0.9359077032699009,
-        "overall_score": 0.5369988073903081
+        "mt_bleu": 0.3785489421990512,
+        "mt_chrf": 56.9267557487146,
+        "cls_acc": 0.4,
+        "mlm_chrf": 94.3625059002704,
+        "overall_score": 0.6376308721632834
       }
     ],
-    "bleu": 0.2750887189010237,
-    "chrf": 46.31463752811596,
-    "accuracy": 0.4,
-    "mlm": 0.9359077032699009,
-    "overall_score": 0.5369988073903081,
+    "mt_bleu": 0.3785489421990512,
+    "mt_chrf": 56.9267557487146,
+    "cls_acc": 0.4,
+    "mlm_chrf": 94.3625059002704,
+    "overall_score": 0.6376308721632834,
     "commonvoice_hours": 0.3,
     "commonvoice_locale": "te",
     "population": {
@@ -839,18 +839,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2584800238292114,
-        "chrf": 44.69889855306244,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9351731522339883,
-        "overall_score": 0.5867732809099554
+        "mt_bleu": 0.29576799752528954,
+        "mt_chrf": 51.55512571221437,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 95.03930657100632,
+        "overall_score": 0.6775369964996245
       }
     ],
-    "bleu": 0.2584800238292114,
-    "chrf": 44.69889855306244,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9351731522339883,
-    "overall_score": 0.5867732809099554,
+    "mt_bleu": 0.29576799752528954,
+    "mt_chrf": 51.55512571221437,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 95.03930657100632,
+    "overall_score": 0.6775369964996245,
     "commonvoice_hours": 20.0,
     "commonvoice_locale": "mr",
     "population": {
@@ -864,18 +864,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.23082586428104943,
-        "chrf": 41.42591471734489,
-        "accuracy": 0.4666666666666667,
-        "mlm": 0.9453687616674971,
-        "overall_score": 0.5476204308717377
+        "mt_bleu": 0.26767127029757953,
+        "mt_chrf": 48.9335568346396,
+        "cls_acc": 0.4666666666666667,
+        "mlm_chrf": 91.68807278010077,
+        "overall_score": 0.6242943209380235
       }
     ],
-    "bleu": 0.23082586428104943,
-    "chrf": 41.42591471734489,
-    "accuracy": 0.4666666666666667,
-    "mlm": 0.9453687616674971,
-    "overall_score": 0.5476204308717377,
+    "mt_bleu": 0.26767127029757953,
+    "mt_chrf": 48.9335568346396,
+    "cls_acc": 0.4666666666666667,
+    "mlm_chrf": 91.68807278010077,
+    "overall_score": 0.6242943209380235,
     "commonvoice_hours": 0.0,
     "commonvoice_locale": "jv",
     "population": {
@@ -890,18 +890,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.252552287345529,
-        "chrf": 43.351007120897606,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.9638175194388952,
-        "overall_score": 0.5832343800392524
+        "mt_bleu": 0.26736329890789995,
+        "mt_chrf": 49.52763533189073,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 94.33244905535389,
+        "overall_score": 0.6573113924019266
      }
    ],
-    "bleu": 0.252552287345529,
-    "chrf": 43.351007120897606,
-    "accuracy": 0.5333333333333333,
-    "mlm": 0.9638175194388952,
-    "overall_score": 0.5832343800392524,
+    "mt_bleu": 0.26736329890789995,
+    "mt_chrf": 49.52763533189073,
+    "cls_acc": 0.5333333333333333,
+    "mlm_chrf": 94.33244905535389,
+    "overall_score": 0.6573113924019266,
     "commonvoice_hours": 5.9,
     "commonvoice_locale": "vi",
     "population": {