David Pomerenke committed
Commit 56081d8 · 1 Parent(s): 8190782

Parallelize everything, select most populous script

Files changed (2):
  1. evals.py +91 -86
  2. results.json +456 -64
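
The "most populous script" part of this commit is the new aggregate_flores_paths helper in evals.py below: when FLORES ships the same language in several scripts, only the variant whose script has the largest writing population is kept. A minimal standalone sketch of that idea (a plain list instead of the pandas Series the real helper receives; the example paths are hypothetical):

from langcodes import Language, standardize_tag

def pick_most_populous_script(flores_paths: list[str]) -> str:
    # flores_paths: FLORES file suffixes for one language, e.g. ["zho_Hans", "zho_Hant"] (hypothetical)
    populations = [
        Language.get(standardize_tag(path, macro=True)).writing_population()
        for path in flores_paths
    ]
    # keep the path whose script is written by the most people
    return flores_paths[populations.index(max(populations))]
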
evals.py CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import json
 import os
 import re
+from datetime import date
 from os import getenv
 
 import evaluate
@@ -10,13 +11,12 @@ import requests
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
 from joblib.memory import Memory
+from langcodes import Language, standardize_tag
+from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
 from openai import AsyncOpenAI
+from requests import get
 from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer
-from datetime import date
-from requests import get
-from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
-from langcodes import standardize_tag, Language
 
 # config
 models = [
@@ -40,17 +40,11 @@ client = AsyncOpenAI(
 )
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
-bertscore = evaluate.load("bertscore")
+# bertscore = evaluate.load("bertscore")
 tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 rate_limit = AsyncLimiter(max_rate=20, time_period=1)
 
 
-def reorder(language_name):
-    if "," in language_name and "(" not in language_name:
-        return language_name.split(",")[1] + " " + language_name.split(",")[0]
-    return language_name
-
-
 # load general language data
 languages = {
     lang: pop
@@ -58,7 +52,9 @@ languages = {
     if not re.match(r".*-[A-Z]{2}$", lang)
 }
 languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
-languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
+languages["language_name"] = languages["bcp_47"].apply(
+    lambda x: Language.get(x).display_name()
+)
 
 # load script codes and names
 scripts = pd.read_csv("data/ScriptCodes.csv").rename(
@@ -70,15 +66,26 @@ def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
 
 
+def aggregate_flores_paths(flores_paths):
+    # takes a list of paths from the same language but different scripts
+    # returns the one with the largest writing population
+    if len(flores_paths) == 1:
+        return flores_paths.values[0]
+    populations = [
+        Language.get(standardize_tag(x, macro=True)).writing_population()
+        for x in flores_paths.values
+    ]
+    return flores_paths.values[populations.index(max(populations))]
+
+
 # load benchmark languages and scripts
 benchmark_dir = "data/floresp-v2.0-rc.3/dev"
 benchmark_languages = pd.DataFrame(
-    [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
-    columns=["iso639_3", "iso15924"],
+    [f.split(".")[1] for f in os.listdir(benchmark_dir)],
+    columns=["flores_path"],
 )
-benchmark_languages["bcp_47"] = benchmark_languages.apply(
-    lambda row: standardize_tag(row["iso639_3"] + "-" + row["iso15924"], macro=True),
-    axis=1,
+benchmark_languages["bcp_47"] = benchmark_languages["flores_path"].apply(
+    lambda x: standardize_tag(x, macro=True),
 )
 # ignore script (language is language)
 benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
@@ -86,7 +93,7 @@ benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
 )
 benchmark_languages = (
     benchmark_languages.groupby("bcp_47")
-    .agg({"iso639_3": "first", "iso15924": "first"})
+    .agg({"flores_path": aggregate_flores_paths})
     .reset_index()
 )
 
@@ -123,14 +130,14 @@ languages = pd.merge(
 languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
 
 languages = languages.sort_values(by="speakers", ascending=False)
-languages = languages.iloc[:10]
+languages = languages.iloc[:30]
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(n=3, random_state=42)
+detailed_languages = languages[languages["in_benchmark"]].sample(n=10, random_state=42)
 
 
 # utils
@@ -158,93 +165,91 @@ async def complete(**kwargs):
     return response
 
 
-async def translate(model, target_language, sentence):
-    script = script_name(target_language.iso15924)
+def load_sentences(language):
+    return open(f"{benchmark_dir}/dev.{language.flores_path}").readlines()
+
+
+@cache
+async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
+    original_language = languages[languages["bcp_47"] == original_language_bcp_47].iloc[
+        0
+    ]
+    target_language = target_languages.iloc[sentence_nr]
+    original_sentence = load_sentences(original_language)[sentence_nr].strip()
+    target_sentence = load_sentences(target_language)[sentence_nr].strip()
+    script = script_name(target_language.flores_path.split("_")[1])
     reply = await complete(
         model=model,
        messages=[
            {
                "role": "user",
-                "content": f"Translate the following text to the {target_language.name} language; use the {script} script; reply only with the translation:\n\n{sentence}",
+                "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
            }
        ],
        temperature=0,
        max_tokens=1024,
    )
-    return reply.choices[0].message.content
-
-
-def mean(l):
-    return sum(l) / len(l) if l else 0
+    prediction = reply.choices[0].message.content.strip()
+    score = bleu.compute(
+        predictions=[prediction],
+        references=[target_sentence],
+        tokenizer=tokenizer.tokenize,
+    )
+    return {
+        "model": model,
+        "bcp_47": original_language["bcp_47"],
+        "bleu": score["bleu"],
+        "sentence_nr": sentence_nr,
+    }
 
 
-def load_sentences(language):
-    return open(
-        f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}"
-    ).readlines()
+def mean(lst):
+    return sum(lst) / len(lst) if lst else 0
 
 
 # evaluation!
 async def main():
+    scores = [
+        translate_and_evaluate(model, original_language.bcp_47, i)
+        for i in range(n_sentences)
+        for original_language in languages.itertuples()
+        for model in models
+        if original_language.in_benchmark
+        and (
+            model == fast_model
+            or original_language.bcp_47 in detailed_languages.bcp_47.values
+        )
+    ]
+    scores = await tqdm_asyncio.gather(*scores, miniters=1)
     results = []
-    for language in list(languages.itertuples()):
-        scores = []
-        if language.in_benchmark:
-            original_sentences = load_sentences(language)[:n_sentences]
-            for model in models:
-                if (
-                    model != fast_model
-                    and language.bcp_47 not in detailed_languages.bcp_47.values
-                ):
-                    continue
-                predictions = [
-                    translate(
-                        model,
-                        language,
-                        sentence,
-                    )
-                    for sentence, language in zip(
-                        original_sentences, target_languages.itertuples()
-                    )
-                ]
-                predictions = await tqdm_asyncio.gather(
-                    *predictions,
-                    miniters=1,
-                    desc=f"{language.name} {model.split('/')[0]}",
-                )
-                target_sentences = [
-                    load_sentences(lang)[i]
-                    for i, lang in enumerate(target_languages.itertuples())
-                ]
-                metrics_bleu = bleu.compute(
-                    predictions=predictions,
-                    references=target_sentences,
-                    tokenizer=tokenizer.tokenize,
-                )
-                # metrics_bert = bertscore.compute(
-                #     predictions=predictions,
-                #     references=target_sentences,
-                #     model_type="distilbert-base-uncased",
-                # )
-                scores.append(
+    for language in languages.itertuples():
+        results_for_language = []
+        for model in models:
+            results_for_model = [
+                score
+                for score in scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
+            if results_for_model:
+                bleu = mean([s["bleu"] for s in results_for_model])
+                results_for_language.append(
                    {
                        "model": model,
-                        "bleu": metrics_bleu["bleu"],
-                        # "bert_score": mean(metrics_bert["f1"]),
+                        "bleu": bleu,
                    }
                )
-        results.append(
-            {
-                "language_name": language.name,
-                "bcp_47": language.bcp_47,
-                "speakers": language.speakers if not pd.isna(language.speakers) else 0,
-                "scores": scores,
-                "bleu": mean([s["bleu"] for s in scores]) if scores else None,
-                # "bert_score": mean([s["bert_score"] for s in scores]),
-                "commonvoice_hours": language.commonvoice_hours,
-                "commonvoice_locale": language.commonvoice_locale,
-            }
-        )
+        if results_for_language:
+            results.append(
+                {
+                    "language_name": language.language_name,
+                    "bcp_47": language.bcp_47,
+                    "speakers": language.speakers,
+                    "scores": results_for_language,
+                    "bleu": mean([s["bleu"] for s in results_for_language]),
+                    "commonvoice_hours": language.commonvoice_hours,
+                    "commonvoice_locale": language.commonvoice_locale,
+                }
+            )
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
 
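
The parallelization itself reduces to building one coroutine per (model, language, sentence) combination up front and awaiting them all at once with tqdm_asyncio.gather, as the new main() does. A minimal self-contained sketch of that pattern, with a dummy task standing in for the real OpenRouter call and BLEU scoring (the model and language names are placeholders):

import asyncio
from tqdm.asyncio import tqdm_asyncio

async def translate_and_evaluate(model: str, bcp_47: str, sentence_nr: int) -> dict:
    # stand-in for the real API call + BLEU computation
    await asyncio.sleep(0.01)
    return {"model": model, "bcp_47": bcp_47, "sentence_nr": sentence_nr}

async def main() -> list[dict]:
    models = ["fast-model", "other-model"]  # hypothetical names
    languages = ["en", "de", "ja"]
    tasks = [
        translate_and_evaluate(model, lang, i)
        for i in range(3)
        for lang in languages
        for model in models
    ]
    # await everything concurrently, with a progress bar (same call as in evals.py)
    return await tqdm_asyncio.gather(*tasks, miniters=1)

if __name__ == "__main__":
    print(len(asyncio.run(main())))
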
results.json CHANGED
@@ -3,49 +3,49 @@
     "language_name": "English",
     "bcp_47": "en",
     "speakers": 1636485840,
-    "scores": [
-      {
-        "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.4931825583688982
-      }
-    ],
-    "bleu": 0.4931825583688982,
-    "commonvoice_hours": 2649.0,
-    "commonvoice_locale": "en"
-  },
-  {
-    "language_name": "Chinese",
-    "bcp_47": "zh",
-    "speakers": 1304678914,
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.4807599914028467
+        "bleu": 0.5292544231540742
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.48224897154012053
+        "bleu": 0.465648126623753
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.2688927547323512
+        "bleu": 0.478174166015779
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.4876059353172742
+        "bleu": 0.5266708610727185
       },
       {
         "model": "deepseek/deepseek-chat",
-        "bleu": 0.46126489333496423
+        "bleu": 0.5549134525314846
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.43306718920654086
+        "bleu": 0.4668163276973811
+      }
+    ],
+    "bleu": 0.5035795595158651,
+    "commonvoice_hours": 2649.0,
+    "commonvoice_locale": "en"
+  },
+  {
+    "language_name": "Chinese",
+    "bcp_47": "zh",
+    "speakers": 1304678914,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.35763875438716014
       }
     ],
-    "bleu": 0.4356399559223496,
+    "bleu": 0.35763875438716014,
     "commonvoice_hours": 422.0,
-    "commonvoice_locale": "zh-TW"
+    "commonvoice_locale": "zh-HK"
   },
   {
     "language_name": "Hindi",
@@ -54,12 +54,12 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.42910938007537924
+        "bleu": 0.33760351976648345
       }
     ],
-    "bleu": 0.42910938007537924,
+    "bleu": 0.33760351976648345,
     "commonvoice_hours": 16.0,
-    "commonvoice_locale": "hi-IN"
+    "commonvoice_locale": "hi"
   },
   {
     "language_name": "Spanish",
@@ -68,10 +68,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3335615012680206
+        "bleu": 0.3600460831160618
       }
     ],
-    "bleu": 0.3335615012680206,
+    "bleu": 0.3600460831160618,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es"
   },
@@ -82,10 +82,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.19072998559991275
+        "bleu": 0.3046598747480405
       }
     ],
-    "bleu": 0.19072998559991275,
+    "bleu": 0.3046598747480405,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar"
   },
@@ -94,32 +94,12 @@
     "bcp_47": "ur",
     "speakers": 290790290,
     "scores": [
-      {
-        "model": "openai/gpt-4o-mini",
-        "bleu": 0.3223557428811336
-      },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3361392064611452
-      },
-      {
-        "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.30361668093990973
-      },
-      {
-        "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.38811035932918286
-      },
-      {
-        "model": "deepseek/deepseek-chat",
-        "bleu": 0.33221997814253806
-      },
-      {
-        "model": "microsoft/phi-4",
-        "bleu": 0.2541447606474814
+        "bleu": 0.331647033312127
       }
     ],
-    "bleu": 0.32276445473356513,
+    "bleu": 0.331647033312127,
     "commonvoice_hours": 76.0,
     "commonvoice_locale": "ur"
   },
@@ -130,10 +110,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.40595466651226686
+        "bleu": 0.3141809404018014
       }
     ],
-    "bleu": 0.40595466651226686,
+    "bleu": 0.3141809404018014,
     "commonvoice_hours": 1051.0,
     "commonvoice_locale": "fr"
   },
@@ -144,10 +124,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.30570858536443696
+        "bleu": 0.27472181972977344
       }
     ],
-    "bleu": 0.30570858536443696,
+    "bleu": 0.27472181972977344,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn"
   },
@@ -158,30 +138,30 @@
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.4122096638493346
+        "bleu": 0.36418677020025814
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.39250552075952033
+        "bleu": 0.36847793827413045
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.22643923104785263
+        "bleu": 0.33146858114564615
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.42197093736929103
+        "bleu": 0.3685111782334586
       },
       {
         "model": "deepseek/deepseek-chat",
-        "bleu": 0.42783260235353093
+        "bleu": 0.41976380092637283
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.38611444119797594
+        "bleu": 0.35431476252948624
       }
     ],
-    "bleu": 0.3778453994295843,
+    "bleu": 0.367787171884892,
     "commonvoice_hours": 176.0,
     "commonvoice_locale": "pt"
   },
@@ -190,13 +170,425 @@
     "bcp_47": "pa",
     "speakers": 203571210,
     "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.29479385926490154
+      },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.34311946995454473
+        "bleu": 0.34372813238670347
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.24553184949811938
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.3934178960662497
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3489400123993954
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.26926813301032654
       }
     ],
-    "bleu": 0.34311946995454473,
+    "bleu": 0.31594664710428266,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN"
+  },
+  {
+    "language_name": "Russian",
+    "bcp_47": "ru",
+    "speakers": 195841151,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2920291935463745
+      }
+    ],
+    "bleu": 0.2920291935463745,
+    "commonvoice_hours": 241.0,
+    "commonvoice_locale": "ru"
+  },
+  {
+    "language_name": "Swahili",
+    "bcp_47": "sw",
+    "speakers": 171610296,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.3240516590412694
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3021494866906426
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.21392015063903014
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.39351510575974585
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.32036034973159405
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.2572750657835761
+      }
+    ],
+    "bleu": 0.3018786362743097,
+    "commonvoice_hours": 411.0,
+    "commonvoice_locale": "sw"
+  },
+  {
+    "language_name": "Indonesian",
+    "bcp_47": "id",
+    "speakers": 171207687,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.31923635687963403
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.32764790212460226
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.2387340248344293
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.36831341439353155
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3614031163582736
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.2526105547535859
+      }
+    ],
+    "bleu": 0.31132422822400946,
+    "commonvoice_hours": 33.0,
+    "commonvoice_locale": "id"
+  },
+  {
+    "language_name": "German",
+    "bcp_47": "de",
+    "speakers": 136350226,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.39299196408709347
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3886659265736507
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.35731041330816654
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.46630655663486287
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.4373279553229372
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.353010712972096
+      }
+    ],
+    "bleu": 0.3992689214831344,
+    "commonvoice_hours": 1357.0,
+    "commonvoice_locale": "de"
+  },
+  {
+    "language_name": "Japanese",
+    "bcp_47": "ja",
+    "speakers": 119729026,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2954810072264808
+      }
+    ],
+    "bleu": 0.2954810072264808,
+    "commonvoice_hours": 222.0,
+    "commonvoice_locale": "ja"
+  },
+  {
+    "language_name": "Telugu",
+    "bcp_47": "te",
+    "speakers": 95478480,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.37949545228579734
+      }
+    ],
+    "bleu": 0.37949545228579734,
+    "commonvoice_hours": 0.3,
+    "commonvoice_locale": "te"
+  },
+  {
+    "language_name": "Marathi",
+    "bcp_47": "mr",
+    "speakers": 92826300,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2852384896861461
+      }
+    ],
+    "bleu": 0.2852384896861461,
+    "commonvoice_hours": 20.0,
+    "commonvoice_locale": "mr"
+  },
+  {
+    "language_name": "Javanese",
+    "bcp_47": "jv",
+    "speakers": 91180665,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.2755399920693052
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2494035065095152
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.1266725662438766
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.35614761567604236
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.29069945440951733
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.20468330413608699
+      }
+    ],
+    "bleu": 0.2505244065073906,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "jv"
+  },
+  {
+    "language_name": "Vietnamese",
+    "bcp_47": "vi",
+    "speakers": 86222962,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2956750563565745
+      }
+    ],
+    "bleu": 0.2956750563565745,
+    "commonvoice_hours": 5.9,
+    "commonvoice_locale": "vi"
+  },
+  {
+    "language_name": "Tamil",
+    "bcp_47": "ta",
+    "speakers": 85616159,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.27547489589987734
+      }
+    ],
+    "bleu": 0.27547489589987734,
+    "commonvoice_hours": 234.0,
+    "commonvoice_locale": "ta"
+  },
+  {
+    "language_name": "Persian",
+    "bcp_47": "fa",
+    "speakers": 84710459,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2858012364771329
+      }
+    ],
+    "bleu": 0.2858012364771329,
+    "commonvoice_hours": 370.0,
+    "commonvoice_locale": "fa"
+  },
+  {
+    "language_name": "Turkish",
+    "bcp_47": "tr",
+    "speakers": 80360704,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.32005697883543305
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3128582218784996
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.26166377989267786
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.3488811534537982
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.352126761953689
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.22855630110633351
+      }
+    ],
+    "bleu": 0.30402386618673855,
+    "commonvoice_hours": 127.0,
+    "commonvoice_locale": "tr"
+  },
+  {
+    "language_name": "Cantonese",
+    "bcp_47": "yue",
+    "speakers": 79654759,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.25523473174207373
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2901127503841879
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.23880603698191288
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.33330775674699475
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.30942219437451896
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.25167599008414604
+      }
+    ],
+    "bleu": 0.27975991005230577,
+    "commonvoice_hours": 203.0,
+    "commonvoice_locale": "yue"
+  },
+  {
+    "language_name": "Korean",
+    "bcp_47": "ko",
+    "speakers": 78357046,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.24501349273295708
+      }
+    ],
+    "bleu": 0.24501349273295708,
+    "commonvoice_hours": 1.7,
+    "commonvoice_locale": "ko"
+  },
+  {
+    "language_name": "Italian",
+    "bcp_47": "it",
+    "speakers": 70247060,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3273249067267197
+      }
+    ],
+    "bleu": 0.3273249067267197,
+    "commonvoice_hours": 362.0,
+    "commonvoice_locale": "it"
+  },
+  {
+    "language_name": "Filipino",
+    "bcp_47": "fil",
+    "speakers": 67471096,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.35950288667055635
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3458571802193247
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.2769096553598123
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.4030081046637165
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3712699611966998
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.25550756070033753
+      }
+    ],
+    "bleu": 0.3353425581350746,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "tl"
+  },
+  {
+    "language_name": "Egyptian Arabic",
+    "bcp_47": "arz",
+    "speakers": 66639360,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.23431638822117362
+      }
+    ],
+    "bleu": 0.23431638822117362,
+    "commonvoice_hours": NaN,
+    "commonvoice_locale": NaN
+  },
+  {
+    "language_name": "Gujarati",
+    "bcp_47": "gu",
+    "speakers": 61721799,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.27834507803114356
+      }
+    ],
+    "bleu": 0.27834507803114356,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "gu-IN"
   }
 ]