Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Files Community

David Pomerenke committed on Mar 7

Commit

e92634d

1 Parent(s): 1b634f3

Add masked language modeling (MLM) task

Browse files

Files changed (3) hide show

app.py +4 -0
evals.py +72 -2
results.json +84 -42

app.py CHANGED Viewed

@@ -187,6 +187,9 @@ def create_language_stats_df(results):
             "Classi-fication": round(lang["accuracy"], 3)
             if lang["accuracy"] is not None
             else "N/A",
             "Best Model": model_link,
             "CommonVoice Hours": commonvoice_link,
         }
@@ -204,6 +207,7 @@ def create_language_stats_df(results):
             "number", # Overall
             "number", # Translation
             "number", # Classification
             "markdown", # Best Model
             "markdown", # CommonVoice Hours
         ],

             "Classi-fication": round(lang["accuracy"], 3)
             if lang["accuracy"] is not None
             else "N/A",
+            "MLM": round(lang["mlm"], 3)
+            if lang["mlm"] is not None
+            else "N/A",
             "Best Model": model_link,
             "CommonVoice Hours": commonvoice_link,
         }
             "number", # Overall
             "number", # Translation
             "number", # Classification
+            "number", # MLM
             "markdown", # Best Model
             "markdown", # CommonVoice Hours
         ],

evals.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import asyncio
 import json
 import os
 import re
 from datetime import date
 from os import getenv
@@ -216,6 +217,7 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
 metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 @cache
 async def classify_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
@@ -238,8 +240,10 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
         frac=1, random_state=42
     )
     test_paragraph = test_paragraphs.iloc[nr]
     def topic_to_number(topic):
         return top_topics.get_loc(topic)
     messages = []
     for example in examples.itertuples():
         messages += [
@@ -271,6 +275,52 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
     }
 def mean(lst):
     """Return the arithmetic mean of `lst`, or 0 when `lst` is empty."""
     if not lst:
         # An empty input has no mean; 0 is reported instead of dividing by zero.
         return 0
     total = sum(lst)
     return total / len(lst)
@@ -302,6 +352,16 @@ async def main():
     classification_scores = await tqdm_asyncio.gather(
         *classification_scores, miniters=1
     )
     results = []
     for language in languages.itertuples():
         results_for_language = []
@@ -316,10 +376,16 @@ async def main():
                 for score in classification_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
             bleu = mean([s["bleu"] for s in translations_for_model])
             chrf = mean([s["chrf"] for s in translations_for_model])
             accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
-            overall_score = (bleu + accuracy) / 2
             if translations_for_model:
                 results_for_language.append(
                     {
@@ -327,6 +393,7 @@ async def main():
                         "bleu": bleu,
                         "chrf": chrf,
                         "accuracy": accuracy,
                         "overall_score": overall_score,
                     }
                 )
@@ -340,7 +407,10 @@ async def main():
                     "bleu": mean([s["bleu"] for s in results_for_language]),
                     "chrf": mean([s["chrf"] for s in results_for_language]),
                     "accuracy": mean([s["accuracy"] for s in results_for_language]),
-                    "overall_score": mean([s["overall_score"] for s in results_for_language]),
                     "commonvoice_hours": language.commonvoice_hours
                     if not pd.isna(language.commonvoice_hours)
                     else None,

 import asyncio
 import json
 import os
+import random
 import re
 from datetime import date
 from os import getenv
 metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 @cache
 async def classify_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
         frac=1, random_state=42
     )
     test_paragraph = test_paragraphs.iloc[nr]
     def topic_to_number(topic):
         return top_topics.get_loc(topic)
     messages = []
     for example in examples.itertuples():
         messages += [
     }
def corrupt_sentence(sentence, mask_fraction=0.05, mask_token="<mask>", rng=None):
    """Replace one contiguous span of `sentence` with a mask token.

    The span starts at a uniformly random position and covers about
    `mask_fraction` of the characters. A non-empty sentence always loses at
    least one character: plain rounding yields a zero-length span for short
    sentences, which would insert the token without hiding anything.

    Args:
        sentence: Text to corrupt.
        mask_fraction: Share of the characters to hide (default 5%).
        mask_token: Placeholder inserted in place of the hidden span.
        rng: Optional object with a `randint(a, b)` method, e.g. a seeded
            `random.Random`, for reproducible masking. Defaults to the
            `random` module.

    Returns:
        The sentence with the chosen span replaced by `mask_token`. An empty
        sentence yields just `mask_token`.
    """
    picker = random if rng is None else rng
    length = len(sentence)
    # Clamp to [1, length] so short sentences still get a real gap and an
    # oversized fraction cannot run past the end of the text.
    mask_length = min(length, max(1, round(length * mask_fraction)))
    start = picker.randint(0, length - mask_length)
    return sentence[:start] + mask_token + sentence[start + mask_length:]
@cache
async def mlm_and_evaluate(model, language_bcp_47, nr):
    """Score `model` on one masked-sentence reconstruction item for a language.

    Every sentence of the language is corrupted with `corrupt_sentence`. Ten
    sentences (sampled with `random_state=42`) serve as few-shot
    demonstrations: the user turn is the corrupted text and the assistant turn
    is the original. The model is then sent the corrupted form of the `nr`-th
    remaining sentence, and its reply is compared with the original via chrF.

    Args:
        model: Model identifier forwarded to `complete`.
        language_bcp_47: BCP-47 code used to look up the row in `languages`.
        nr: Position of the test sentence within the shuffled sentences that
            are not used as demonstrations.

    Returns:
        A dict with the keys `model`, `bcp_47`, `chrf` (the `score` field
        returned by `chrf.compute`) and `sentence_nr`.

    Raises:
        IndexError: If no row of `languages` matches `language_bcp_47`, or
            `nr` is out of range for the remaining sentences.
    """
    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
    sentences = pd.DataFrame(load_sentences(language), columns=["text"])
    # NOTE(review): corrupt_sentence draws from the `random` module, which is
    # not seeded in this function, so the masked spans look non-reproducible
    # across runs, unlike the `random_state=42` sampling below. Confirm intent.
    sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
    examples = sentences.sample(n=10, random_state=42)
    # Keep the demonstration sentences out of the pool the test item comes from.
    test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
        frac=1, random_state=42
    )
    test_sentence = test_sentences.iloc[nr]
    messages = []
    for example in examples.itertuples():
        messages += [
            {"role": "user", "content": example.corrupt_text},
            {"role": "assistant", "content": example.text},
        ]
    reply = await complete(
        model=model,
        messages=[
            *messages,
            {
                "role": "user",
                "content": test_sentence.corrupt_text,
            },
        ],
        temperature=0,
        max_tokens=1024,
    )
    # Presumably an OpenAI-style response, where `content` can be None (TODO
    # confirm against `complete`). Score a missing reply as an empty prediction
    # instead of raising AttributeError and aborting the whole gathered run.
    content = reply.choices[0].message.content
    prediction = (content or "").strip()
    chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
    return {
        "model": model,
        "bcp_47": language["bcp_47"],
        "chrf": chrf_score["score"],
        "sentence_nr": nr,
    }
 def mean(lst):
     """Return the arithmetic mean of `lst`, or 0 when `lst` is empty."""
     if not lst:
         # An empty input has no mean; 0 is reported instead of dividing by zero.
         return 0
     total = sum(lst)
     return total / len(lst)
     classification_scores = await tqdm_asyncio.gather(
         *classification_scores, miniters=1
     )
+    print("evaluate mlm")
+    mlm_scores = [
+        mlm_and_evaluate(model, language.bcp_47, i)
+        for i in range(n_sentences)
+        for language in languages.itertuples()
+        for model in models
+        if language.in_benchmark
+        and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
+    ]
+    mlm_scores = await tqdm_asyncio.gather(*mlm_scores, miniters=1)
     results = []
     for language in languages.itertuples():
         results_for_language = []
                 for score in classification_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
+            mlm_for_model = [
+                score
+                for score in mlm_scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
             bleu = mean([s["bleu"] for s in translations_for_model])
             chrf = mean([s["chrf"] for s in translations_for_model])
             accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            mlm = mean([s["chrf"] for s in mlm_for_model]) / 100
+            overall_score = (bleu + accuracy + mlm) / 3
             if translations_for_model:
                 results_for_language.append(
                     {
                         "bleu": bleu,
                         "chrf": chrf,
                         "accuracy": accuracy,
+                        "mlm": mlm,
                         "overall_score": overall_score,
                     }
                 )
                     "bleu": mean([s["bleu"] for s in results_for_language]),
                     "chrf": mean([s["chrf"] for s in results_for_language]),
                     "accuracy": mean([s["accuracy"] for s in results_for_language]),
+                    "mlm": mean([s["mlm"] for s in results_for_language]),
+                    "overall_score": mean(
+                        [s["overall_score"] for s in results_for_language]
+                    ),
                     "commonvoice_hours": language.commonvoice_hours
                     if not pd.isna(language.commonvoice_hours)
                     else None,

results.json CHANGED Viewed

@@ -9,41 +9,47 @@
         "bleu": 0.89404322120213,
         "chrf": 92.53933977489264,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.7303549439343984
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.4351349353198866,
         "chrf": 54.9504915580248,
         "accuracy": 0.6,
-        "overall_score": 0.5175674676599433
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
         "bleu": 0.8800468872938262,
         "chrf": 94.30164664106223,
         "accuracy": 0.5333333333333333,
-        "overall_score": 0.7066901103135798
       },
       {
         "model": "google/gemini-2.0-flash-001",
         "bleu": 0.8489646963773831,
         "chrf": 92.73129066280984,
         "accuracy": 0.8666666666666667,
-        "overall_score": 0.8578156815220249
       },
       {
         "model": "microsoft/phi-4",
         "bleu": 0.8230104823079876,
         "chrf": 91.69043412576788,
         "accuracy": 0.7,
-        "overall_score": 0.7615052411539938
       }
     ],
     "bleu": 0.7762400445002428,
     "chrf": 85.24264055251147,
     "accuracy": 0.6533333333333333,
-    "overall_score": 0.714786688916788,
     "commonvoice_hours": 2651.0,
     "commonvoice_locale": "en",
     "population": {
@@ -214,13 +220,15 @@
         "bleu": 0.3977775857451761,
         "chrf": 57.672913792439125,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.48222212620592136
       }
     ],
     "bleu": 0.3977775857451761,
     "chrf": 57.672913792439125,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.48222212620592136,
     "commonvoice_hours": 422.0,
     "commonvoice_locale": "zh-TW",
     "population": {
@@ -256,13 +264,15 @@
         "bleu": 0.333521621016373,
         "chrf": 50.48364584189306,
         "accuracy": 0.5,
-        "overall_score": 0.4167608105081865
       }
     ],
     "bleu": 0.333521621016373,
     "chrf": 50.48364584189306,
     "accuracy": 0.5,
-    "overall_score": 0.4167608105081865,
     "commonvoice_hours": 16.0,
     "commonvoice_locale": "hi-IN",
     "population": {
@@ -284,13 +294,15 @@
         "bleu": 0.29160032861883095,
         "chrf": 47.668399832701844,
         "accuracy": 0.5,
-        "overall_score": 0.39580016430941545
       }
     ],
     "bleu": 0.29160032861883095,
     "chrf": 47.668399832701844,
     "accuracy": 0.5,
-    "overall_score": 0.39580016430941545,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es",
     "population": {
@@ -345,13 +357,15 @@
         "bleu": 0.277257629790728,
         "chrf": 46.62779335380641,
         "accuracy": 0.4666666666666667,
-        "overall_score": 0.37196214822869733
       }
     ],
     "bleu": 0.277257629790728,
     "chrf": 46.62779335380641,
     "accuracy": 0.4666666666666667,
-    "overall_score": 0.37196214822869733,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar",
     "population": {
@@ -405,13 +419,15 @@
         "bleu": 0.2659144372728079,
         "chrf": 44.14831240898717,
         "accuracy": 0.43333333333333335,
-        "overall_score": 0.34962388530307065
       }
     ],
     "bleu": 0.2659144372728079,
     "chrf": 44.14831240898717,
     "accuracy": 0.43333333333333335,
-    "overall_score": 0.34962388530307065,
     "commonvoice_hours": 77.0,
     "commonvoice_locale": "ur",
     "population": {
@@ -432,13 +448,15 @@
         "bleu": 0.315663773358301,
         "chrf": 49.253978669350964,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.4411652200124838
       }
     ],
     "bleu": 0.315663773358301,
     "chrf": 49.253978669350964,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.4411652200124838,
     "commonvoice_hours": 1052.0,
     "commonvoice_locale": "fr",
     "population": {
@@ -516,13 +534,15 @@
         "bleu": 0.21265887286151353,
         "chrf": 41.501657722373686,
         "accuracy": 0.4,
-        "overall_score": 0.3063294364307568
       }
     ],
     "bleu": 0.21265887286151353,
     "chrf": 41.501657722373686,
     "accuracy": 0.4,
-    "overall_score": 0.3063294364307568,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn",
     "population": {
@@ -543,13 +563,15 @@
         "bleu": 0.27514792195783394,
         "chrf": 45.901248962808694,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.42090729431225027
       }
     ],
     "bleu": 0.27514792195783394,
     "chrf": 45.901248962808694,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.42090729431225027,
     "commonvoice_hours": 177.0,
     "commonvoice_locale": "pt",
     "population": {
@@ -581,13 +603,15 @@
         "bleu": 0.3048037308116852,
         "chrf": 48.4304965568793,
         "accuracy": 0.5333333333333333,
-        "overall_score": 0.41906853207250927
       }
     ],
     "bleu": 0.3048037308116852,
     "chrf": 48.4304965568793,
     "accuracy": 0.5333333333333333,
-    "overall_score": 0.41906853207250927,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN",
     "population": {
@@ -609,13 +633,15 @@
         "bleu": 0.26108507692625094,
         "chrf": 45.063308940468154,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.4138758717964588
       }
     ],
     "bleu": 0.26108507692625094,
     "chrf": 45.063308940468154,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.4138758717964588,
     "commonvoice_hours": 242.0,
     "commonvoice_locale": "ru",
     "population": {
@@ -654,13 +680,15 @@
         "bleu": 0.2709203338132304,
         "chrf": 44.36399636969686,
         "accuracy": 0.5,
-        "overall_score": 0.3854601669066152
       }
     ],
     "bleu": 0.2709203338132304,
     "chrf": 44.36399636969686,
     "accuracy": 0.5,
-    "overall_score": 0.3854601669066152,
     "commonvoice_hours": 411.0,
     "commonvoice_locale": "sw",
     "population": {
@@ -685,13 +713,15 @@
         "bleu": 0.27441353638286026,
         "chrf": 46.025445629112156,
         "accuracy": 0.6,
-        "overall_score": 0.4372067681914301
       }
     ],
     "bleu": 0.27441353638286026,
     "chrf": 46.025445629112156,
     "accuracy": 0.6,
-    "overall_score": 0.4372067681914301,
     "commonvoice_hours": 33.0,
     "commonvoice_locale": "id",
     "population": {
@@ -709,13 +739,15 @@
         "bleu": 0.3338682761061998,
         "chrf": 50.216731068308064,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.4502674713864332
       }
     ],
     "bleu": 0.3338682761061998,
     "chrf": 50.216731068308064,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.4502674713864332,
     "commonvoice_hours": 1358.0,
     "commonvoice_locale": "de",
     "population": {
@@ -758,13 +790,15 @@
         "bleu": 0.2940100667664714,
         "chrf": 46.403097021492236,
         "accuracy": 0.6,
-        "overall_score": 0.4470050333832357
       }
     ],
     "bleu": 0.2940100667664714,
     "chrf": 46.403097021492236,
     "accuracy": 0.6,
-    "overall_score": 0.4470050333832357,
     "commonvoice_hours": 222.0,
     "commonvoice_locale": "ja",
     "population": {
@@ -783,13 +817,15 @@
         "bleu": 0.2750887189010237,
         "chrf": 46.31463752811596,
         "accuracy": 0.4,
-        "overall_score": 0.33754435945051187
       }
     ],
     "bleu": 0.2750887189010237,
     "chrf": 46.31463752811596,
     "accuracy": 0.4,
-    "overall_score": 0.33754435945051187,
     "commonvoice_hours": 0.3,
     "commonvoice_locale": "te",
     "population": {
@@ -806,13 +842,15 @@
         "bleu": 0.2584800238292114,
         "chrf": 44.69889855306244,
         "accuracy": 0.5666666666666667,
-        "overall_score": 0.41257334524793904
       }
     ],
     "bleu": 0.2584800238292114,
     "chrf": 44.69889855306244,
     "accuracy": 0.5666666666666667,
-    "overall_score": 0.41257334524793904,
     "commonvoice_hours": 20.0,
     "commonvoice_locale": "mr",
     "population": {
@@ -829,13 +867,15 @@
         "bleu": 0.23082586428104943,
         "chrf": 41.42591471734489,
         "accuracy": 0.4666666666666667,
-        "overall_score": 0.34874626547385806
       }
     ],
     "bleu": 0.23082586428104943,
     "chrf": 41.42591471734489,
     "accuracy": 0.4666666666666667,
-    "overall_score": 0.34874626547385806,
     "commonvoice_hours": 0.0,
     "commonvoice_locale": "jv",
     "population": {
@@ -853,13 +893,15 @@
         "bleu": 0.252552287345529,
         "chrf": 43.351007120897606,
         "accuracy": 0.5333333333333333,
-        "overall_score": 0.3929428103394311
       }
     ],
     "bleu": 0.252552287345529,
     "chrf": 43.351007120897606,
     "accuracy": 0.5333333333333333,
-    "overall_score": 0.3929428103394311,
     "commonvoice_hours": 5.9,
     "commonvoice_locale": "vi",
     "population": {

         "bleu": 0.89404322120213,
         "chrf": 92.53933977489264,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.9778605197038973,
+        "overall_score": 0.8128568025242314
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.4351349353198866,
         "chrf": 54.9504915580248,
         "accuracy": 0.6,
+        "mlm": 0.9681484728467826,
+        "overall_score": 0.6677611360555563
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
         "bleu": 0.8800468872938262,
         "chrf": 94.30164664106223,
         "accuracy": 0.5333333333333333,
+        "mlm": 0.804094099273989,
+        "overall_score": 0.7391581066337162
       },
       {
         "model": "google/gemini-2.0-flash-001",
         "bleu": 0.8489646963773831,
         "chrf": 92.73129066280984,
         "accuracy": 0.8666666666666667,
+        "mlm": 0.9770616407001859,
+        "overall_score": 0.8975643345814119
       },
       {
         "model": "microsoft/phi-4",
         "bleu": 0.8230104823079876,
         "chrf": 91.69043412576788,
         "accuracy": 0.7,
+        "mlm": 0.9632049588292643,
+        "overall_score": 0.8287384803790839
       }
     ],
     "bleu": 0.7762400445002428,
     "chrf": 85.24264055251147,
     "accuracy": 0.6533333333333333,
+    "mlm": 0.9380739382708239,
+    "overall_score": 0.7892157720348,
     "commonvoice_hours": 2651.0,
     "commonvoice_locale": "en",
     "population": {
         "bleu": 0.3977775857451761,
         "chrf": 57.672913792439125,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.926731451729437,
+        "overall_score": 0.6303919013804266
       }
     ],
     "bleu": 0.3977775857451761,
     "chrf": 57.672913792439125,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.926731451729437,
+    "overall_score": 0.6303919013804266,
     "commonvoice_hours": 422.0,
     "commonvoice_locale": "zh-TW",
     "population": {
         "bleu": 0.333521621016373,
         "chrf": 50.48364584189306,
         "accuracy": 0.5,
+        "mlm": 0.9585976421208252,
+        "overall_score": 0.5973730877123994
       }
     ],
     "bleu": 0.333521621016373,
     "chrf": 50.48364584189306,
     "accuracy": 0.5,
+    "mlm": 0.9585976421208252,
+    "overall_score": 0.5973730877123994,
     "commonvoice_hours": 16.0,
     "commonvoice_locale": "hi-IN",
     "population": {
         "bleu": 0.29160032861883095,
         "chrf": 47.668399832701844,
         "accuracy": 0.5,
+        "mlm": 0.9272973828072317,
+        "overall_score": 0.5729659038086875
       }
     ],
     "bleu": 0.29160032861883095,
     "chrf": 47.668399832701844,
     "accuracy": 0.5,
+    "mlm": 0.9272973828072317,
+    "overall_score": 0.5729659038086875,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es",
     "population": {
         "bleu": 0.277257629790728,
         "chrf": 46.62779335380641,
         "accuracy": 0.4666666666666667,
+        "mlm": 0.9617481078420298,
+        "overall_score": 0.5685574680998081
       }
     ],
     "bleu": 0.277257629790728,
     "chrf": 46.62779335380641,
     "accuracy": 0.4666666666666667,
+    "mlm": 0.9617481078420298,
+    "overall_score": 0.5685574680998081,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar",
     "population": {
         "bleu": 0.2659144372728079,
         "chrf": 44.14831240898717,
         "accuracy": 0.43333333333333335,
+        "mlm": 0.9414677321132675,
+        "overall_score": 0.5469051675731363
       }
     ],
     "bleu": 0.2659144372728079,
     "chrf": 44.14831240898717,
     "accuracy": 0.43333333333333335,
+    "mlm": 0.9414677321132675,
+    "overall_score": 0.5469051675731363,
     "commonvoice_hours": 77.0,
     "commonvoice_locale": "ur",
     "population": {
         "bleu": 0.315663773358301,
         "chrf": 49.253978669350964,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.960796739893282,
+        "overall_score": 0.6143757266394165
       }
     ],
     "bleu": 0.315663773358301,
     "chrf": 49.253978669350964,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.960796739893282,
+    "overall_score": 0.6143757266394165,
     "commonvoice_hours": 1052.0,
     "commonvoice_locale": "fr",
     "population": {
         "bleu": 0.21265887286151353,
         "chrf": 41.501657722373686,
         "accuracy": 0.4,
+        "mlm": 0.8995272489886615,
+        "overall_score": 0.504062040616725
       }
     ],
     "bleu": 0.21265887286151353,
     "chrf": 41.501657722373686,
     "accuracy": 0.4,
+    "mlm": 0.8995272489886615,
+    "overall_score": 0.504062040616725,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn",
     "population": {
         "bleu": 0.27514792195783394,
         "chrf": 45.901248962808694,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.9640739007405215,
+        "overall_score": 0.6019628297883407
       }
     ],
     "bleu": 0.27514792195783394,
     "chrf": 45.901248962808694,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.9640739007405215,
+    "overall_score": 0.6019628297883407,
     "commonvoice_hours": 177.0,
     "commonvoice_locale": "pt",
     "population": {
         "bleu": 0.3048037308116852,
         "chrf": 48.4304965568793,
         "accuracy": 0.5333333333333333,
+        "mlm": 0.9033444436966103,
+        "overall_score": 0.5804938359472096
       }
     ],
     "bleu": 0.3048037308116852,
     "chrf": 48.4304965568793,
     "accuracy": 0.5333333333333333,
+    "mlm": 0.9033444436966103,
+    "overall_score": 0.5804938359472096,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN",
     "population": {
         "bleu": 0.26108507692625094,
         "chrf": 45.063308940468154,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.9563400339874765,
+        "overall_score": 0.5946972591934646
       }
     ],
     "bleu": 0.26108507692625094,
     "chrf": 45.063308940468154,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.9563400339874765,
+    "overall_score": 0.5946972591934646,
     "commonvoice_hours": 242.0,
     "commonvoice_locale": "ru",
     "population": {
         "bleu": 0.2709203338132304,
         "chrf": 44.36399636969686,
         "accuracy": 0.5,
+        "mlm": 0.9612351448314987,
+        "overall_score": 0.5773851595482431
       }
     ],
     "bleu": 0.2709203338132304,
     "chrf": 44.36399636969686,
     "accuracy": 0.5,
+    "mlm": 0.9612351448314987,
+    "overall_score": 0.5773851595482431,
     "commonvoice_hours": 411.0,
     "commonvoice_locale": "sw",
     "population": {
         "bleu": 0.27441353638286026,
         "chrf": 46.025445629112156,
         "accuracy": 0.6,
+        "mlm": 0.9465444909745621,
+        "overall_score": 0.6069860091191407
       }
     ],
     "bleu": 0.27441353638286026,
     "chrf": 46.025445629112156,
     "accuracy": 0.6,
+    "mlm": 0.9465444909745621,
+    "overall_score": 0.6069860091191407,
     "commonvoice_hours": 33.0,
     "commonvoice_locale": "id",
     "population": {
         "bleu": 0.3338682761061998,
         "chrf": 50.216731068308064,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.9526738506105953,
+        "overall_score": 0.6177362644611538
       }
     ],
     "bleu": 0.3338682761061998,
     "chrf": 50.216731068308064,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.9526738506105953,
+    "overall_score": 0.6177362644611538,
     "commonvoice_hours": 1358.0,
     "commonvoice_locale": "de",
     "population": {
         "bleu": 0.2940100667664714,
         "chrf": 46.403097021492236,
         "accuracy": 0.6,
+        "mlm": 0.9337910001211718,
+        "overall_score": 0.609267022295881
       }
     ],
     "bleu": 0.2940100667664714,
     "chrf": 46.403097021492236,
     "accuracy": 0.6,
+    "mlm": 0.9337910001211718,
+    "overall_score": 0.609267022295881,
     "commonvoice_hours": 222.0,
     "commonvoice_locale": "ja",
     "population": {
         "bleu": 0.2750887189010237,
         "chrf": 46.31463752811596,
         "accuracy": 0.4,
+        "mlm": 0.9359077032699009,
+        "overall_score": 0.5369988073903081
       }
     ],
     "bleu": 0.2750887189010237,
     "chrf": 46.31463752811596,
     "accuracy": 0.4,
+    "mlm": 0.9359077032699009,
+    "overall_score": 0.5369988073903081,
     "commonvoice_hours": 0.3,
     "commonvoice_locale": "te",
     "population": {
         "bleu": 0.2584800238292114,
         "chrf": 44.69889855306244,
         "accuracy": 0.5666666666666667,
+        "mlm": 0.9351731522339883,
+        "overall_score": 0.5867732809099554
       }
     ],
     "bleu": 0.2584800238292114,
     "chrf": 44.69889855306244,
     "accuracy": 0.5666666666666667,
+    "mlm": 0.9351731522339883,
+    "overall_score": 0.5867732809099554,
     "commonvoice_hours": 20.0,
     "commonvoice_locale": "mr",
     "population": {
         "bleu": 0.23082586428104943,
         "chrf": 41.42591471734489,
         "accuracy": 0.4666666666666667,
+        "mlm": 0.9453687616674971,
+        "overall_score": 0.5476204308717377
       }
     ],
     "bleu": 0.23082586428104943,
     "chrf": 41.42591471734489,
     "accuracy": 0.4666666666666667,
+    "mlm": 0.9453687616674971,
+    "overall_score": 0.5476204308717377,
     "commonvoice_hours": 0.0,
     "commonvoice_locale": "jv",
     "population": {
         "bleu": 0.252552287345529,
         "chrf": 43.351007120897606,
         "accuracy": 0.5333333333333333,
+        "mlm": 0.9638175194388952,
+        "overall_score": 0.5832343800392524
       }
     ],
     "bleu": 0.252552287345529,
     "chrf": 43.351007120897606,
     "accuracy": 0.5333333333333333,
+    "mlm": 0.9638175194388952,
+    "overall_score": 0.5832343800392524,
     "commonvoice_hours": 5.9,
     "commonvoice_locale": "vi",
     "population": {