David Pomerenke committed on
Commit
e92634d
·
1 Parent(s): 1b634f3

Add masked language modeling (MLM) task

Browse files
Files changed (3) hide show
  1. app.py +4 -0
  2. evals.py +72 -2
  3. results.json +84 -42
app.py CHANGED
@@ -187,6 +187,9 @@ def create_language_stats_df(results):
187
  "Classi-fication": round(lang["accuracy"], 3)
188
  if lang["accuracy"] is not None
189
  else "N/A",
 
 
 
190
  "Best Model": model_link,
191
  "CommonVoice Hours": commonvoice_link,
192
  }
@@ -204,6 +207,7 @@ def create_language_stats_df(results):
204
  "number", # Overall
205
  "number", # Translation
206
  "number", # Classification
 
207
  "markdown", # Best Model
208
  "markdown", # CommonVoice Hours
209
  ],
 
187
  "Classi-fication": round(lang["accuracy"], 3)
188
  if lang["accuracy"] is not None
189
  else "N/A",
190
+ "MLM": round(lang["mlm"], 3)
191
+ if lang["mlm"] is not None
192
+ else "N/A",
193
  "Best Model": model_link,
194
  "CommonVoice Hours": commonvoice_link,
195
  }
 
207
  "number", # Overall
208
  "number", # Translation
209
  "number", # Classification
210
+ "number", # MLM
211
  "markdown", # Best Model
212
  "markdown", # CommonVoice Hours
213
  ],
evals.py CHANGED
@@ -1,6 +1,7 @@
1
  import asyncio
2
  import json
3
  import os
 
4
  import re
5
  from datetime import date
6
  from os import getenv
@@ -216,6 +217,7 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
216
 
217
  metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
218
 
 
219
  @cache
220
  async def classify_and_evaluate(model, language_bcp_47, nr):
221
  language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
@@ -238,8 +240,10 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
238
  frac=1, random_state=42
239
  )
240
  test_paragraph = test_paragraphs.iloc[nr]
 
241
  def topic_to_number(topic):
242
  return top_topics.get_loc(topic)
 
243
  messages = []
244
  for example in examples.itertuples():
245
  messages += [
@@ -271,6 +275,52 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
271
  }
272
 
273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  def mean(lst):
275
  return sum(lst) / len(lst) if lst else 0
276
 
@@ -302,6 +352,16 @@ async def main():
302
  classification_scores = await tqdm_asyncio.gather(
303
  *classification_scores, miniters=1
304
  )
 
 
 
 
 
 
 
 
 
 
305
  results = []
306
  for language in languages.itertuples():
307
  results_for_language = []
@@ -316,10 +376,16 @@ async def main():
316
  for score in classification_scores
317
  if score["bcp_47"] == language.bcp_47 and score["model"] == model
318
  ]
 
 
 
 
 
319
  bleu = mean([s["bleu"] for s in translations_for_model])
320
  chrf = mean([s["chrf"] for s in translations_for_model])
321
  accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
322
- overall_score = (bleu + accuracy) / 2
 
323
  if translations_for_model:
324
  results_for_language.append(
325
  {
@@ -327,6 +393,7 @@ async def main():
327
  "bleu": bleu,
328
  "chrf": chrf,
329
  "accuracy": accuracy,
 
330
  "overall_score": overall_score,
331
  }
332
  )
@@ -340,7 +407,10 @@ async def main():
340
  "bleu": mean([s["bleu"] for s in results_for_language]),
341
  "chrf": mean([s["chrf"] for s in results_for_language]),
342
  "accuracy": mean([s["accuracy"] for s in results_for_language]),
343
- "overall_score": mean([s["overall_score"] for s in results_for_language]),
 
 
 
344
  "commonvoice_hours": language.commonvoice_hours
345
  if not pd.isna(language.commonvoice_hours)
346
  else None,
 
1
  import asyncio
2
  import json
3
  import os
4
+ import random
5
  import re
6
  from datetime import date
7
  from os import getenv
 
217
 
218
  metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
219
 
220
+
221
  @cache
222
  async def classify_and_evaluate(model, language_bcp_47, nr):
223
  language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
 
240
  frac=1, random_state=42
241
  )
242
  test_paragraph = test_paragraphs.iloc[nr]
243
+
244
  def topic_to_number(topic):
245
  return top_topics.get_loc(topic)
246
+
247
  messages = []
248
  for example in examples.itertuples():
249
  messages += [
 
275
  }
276
 
277
 
278
def corrupt_sentence(sentence, fraction=0.05, mask_token="<mask>", rng=None):
    """Replace one contiguous span of *sentence* with a mask token.

    Args:
        sentence: The text to corrupt.
        fraction: Share of the characters to hide (default 5%).
        mask_token: Placeholder inserted where the span was removed.
        rng: Optional ``random.Random``-like object providing ``randint``.
            Defaults to the module-level ``random`` generator; pass a seeded
            instance to get reproducible corruptions.

    Returns:
        The sentence with a randomly positioned span replaced by
        ``mask_token``. An empty sentence yields just ``mask_token``.
    """
    if rng is None:
        rng = random
    if not sentence:
        return mask_token
    # round() can yield 0 for short sentences (and round(0.5) == 0 because
    # Python rounds halves to even), which would insert the mask without
    # hiding anything. Always hide at least one character, never more than
    # the whole sentence.
    mask_length = min(len(sentence), max(1, round(len(sentence) * fraction)))
    # randint is inclusive on both ends, so the span always fits.
    start = rng.randint(0, len(sentence) - mask_length)
    end = start + mask_length
    return sentence[:start] + mask_token + sentence[end:]
284
+
285
+
286
@cache
async def mlm_and_evaluate(model, language_bcp_47, nr):
    """Score *model* on reconstructing one masked sentence of a language.

    Every sentence of the language is corrupted with ``corrupt_sentence``.
    Ten sentences are sampled as few-shot examples (corrupted text as the
    user turn, original text as the assistant turn). The ``nr``-th of the
    remaining, shuffled sentences is the test item. The model's reply is
    compared with the original sentence via ``chrf.compute``.

    Args:
        model: Model identifier passed through to ``complete``.
        language_bcp_47: BCP-47 code used to look up the row in ``languages``.
        nr: Positional index of the test sentence among the non-example
            sentences.

    Returns:
        A dict with the keys ``model``, ``bcp_47``, ``chrf`` and
        ``sentence_nr``.
    """
    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
    sentences = pd.DataFrame(load_sentences(language), columns=["text"])

    # The sampling below is pinned to random_state=42, but corrupt_sentence
    # draws from the global RNG. Without a seed, every call (and every run)
    # would mask different spans, so few-shot prompts and test items would not
    # be reproducible. Seed for the corruption step only, then restore the
    # previous RNG state so other users of `random` are unaffected. There is
    # no `await` in between, so other tasks cannot interleave here.
    rng_state = random.getstate()
    random.seed(42)
    try:
        sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
    finally:
        random.setstate(rng_state)

    examples = sentences.sample(n=10, random_state=42)
    # Keep the few-shot examples out of the test pool.
    remaining = sentences[~sentences["text"].isin(examples["text"])]
    test_sentences = remaining.sample(frac=1, random_state=42)
    test_sentence = test_sentences.iloc[nr]

    messages = []
    for example in examples.itertuples():
        messages.append({"role": "user", "content": example.corrupt_text})
        messages.append({"role": "assistant", "content": example.text})
    messages.append({"role": "user", "content": test_sentence.corrupt_text})

    reply = await complete(
        model=model,
        messages=messages,
        temperature=0,
        max_tokens=1024,
    )
    # NOTE(review): presumably an OpenAI-style response whose content may be
    # None (e.g. filtered output) - treat that as an empty prediction instead
    # of failing on `.strip()`; confirm against `complete`.
    content = reply.choices[0].message.content
    prediction = (content or "").strip()
    chrf_score = chrf.compute(
        predictions=[prediction], references=[test_sentence.text]
    )
    return {
        "model": model,
        "bcp_47": language["bcp_47"],
        "chrf": chrf_score["score"],
        "sentence_nr": nr,
    }
322
+
323
+
324
  def mean(lst):
325
  return sum(lst) / len(lst) if lst else 0
326
 
 
352
  classification_scores = await tqdm_asyncio.gather(
353
  *classification_scores, miniters=1
354
  )
355
+ print("evaluate mlm")
356
+ mlm_scores = [
357
+ mlm_and_evaluate(model, language.bcp_47, i)
358
+ for i in range(n_sentences)
359
+ for language in languages.itertuples()
360
+ for model in models
361
+ if language.in_benchmark
362
+ and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
363
+ ]
364
+ mlm_scores = await tqdm_asyncio.gather(*mlm_scores, miniters=1)
365
  results = []
366
  for language in languages.itertuples():
367
  results_for_language = []
 
376
  for score in classification_scores
377
  if score["bcp_47"] == language.bcp_47 and score["model"] == model
378
  ]
379
+ mlm_for_model = [
380
+ score
381
+ for score in mlm_scores
382
+ if score["bcp_47"] == language.bcp_47 and score["model"] == model
383
+ ]
384
  bleu = mean([s["bleu"] for s in translations_for_model])
385
  chrf = mean([s["chrf"] for s in translations_for_model])
386
  accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
387
+ mlm = mean([s["chrf"] for s in mlm_for_model]) / 100
388
+ overall_score = (bleu + accuracy + mlm) / 3
389
  if translations_for_model:
390
  results_for_language.append(
391
  {
 
393
  "bleu": bleu,
394
  "chrf": chrf,
395
  "accuracy": accuracy,
396
+ "mlm": mlm,
397
  "overall_score": overall_score,
398
  }
399
  )
 
407
  "bleu": mean([s["bleu"] for s in results_for_language]),
408
  "chrf": mean([s["chrf"] for s in results_for_language]),
409
  "accuracy": mean([s["accuracy"] for s in results_for_language]),
410
+ "mlm": mean([s["mlm"] for s in results_for_language]),
411
+ "overall_score": mean(
412
+ [s["overall_score"] for s in results_for_language]
413
+ ),
414
  "commonvoice_hours": language.commonvoice_hours
415
  if not pd.isna(language.commonvoice_hours)
416
  else None,
results.json CHANGED
@@ -9,41 +9,47 @@
9
  "bleu": 0.89404322120213,
10
  "chrf": 92.53933977489264,
11
  "accuracy": 0.5666666666666667,
12
- "overall_score": 0.7303549439343984
 
13
  },
14
  {
15
  "model": "meta-llama/llama-3.3-70b-instruct",
16
  "bleu": 0.4351349353198866,
17
  "chrf": 54.9504915580248,
18
  "accuracy": 0.6,
19
- "overall_score": 0.5175674676599433
 
20
  },
21
  {
22
  "model": "mistralai/mistral-small-24b-instruct-2501",
23
  "bleu": 0.8800468872938262,
24
  "chrf": 94.30164664106223,
25
  "accuracy": 0.5333333333333333,
26
- "overall_score": 0.7066901103135798
 
27
  },
28
  {
29
  "model": "google/gemini-2.0-flash-001",
30
  "bleu": 0.8489646963773831,
31
  "chrf": 92.73129066280984,
32
  "accuracy": 0.8666666666666667,
33
- "overall_score": 0.8578156815220249
 
34
  },
35
  {
36
  "model": "microsoft/phi-4",
37
  "bleu": 0.8230104823079876,
38
  "chrf": 91.69043412576788,
39
  "accuracy": 0.7,
40
- "overall_score": 0.7615052411539938
 
41
  }
42
  ],
43
  "bleu": 0.7762400445002428,
44
  "chrf": 85.24264055251147,
45
  "accuracy": 0.6533333333333333,
46
- "overall_score": 0.714786688916788,
 
47
  "commonvoice_hours": 2651.0,
48
  "commonvoice_locale": "en",
49
  "population": {
@@ -214,13 +220,15 @@
214
  "bleu": 0.3977775857451761,
215
  "chrf": 57.672913792439125,
216
  "accuracy": 0.5666666666666667,
217
- "overall_score": 0.48222212620592136
 
218
  }
219
  ],
220
  "bleu": 0.3977775857451761,
221
  "chrf": 57.672913792439125,
222
  "accuracy": 0.5666666666666667,
223
- "overall_score": 0.48222212620592136,
 
224
  "commonvoice_hours": 422.0,
225
  "commonvoice_locale": "zh-TW",
226
  "population": {
@@ -256,13 +264,15 @@
256
  "bleu": 0.333521621016373,
257
  "chrf": 50.48364584189306,
258
  "accuracy": 0.5,
259
- "overall_score": 0.4167608105081865
 
260
  }
261
  ],
262
  "bleu": 0.333521621016373,
263
  "chrf": 50.48364584189306,
264
  "accuracy": 0.5,
265
- "overall_score": 0.4167608105081865,
 
266
  "commonvoice_hours": 16.0,
267
  "commonvoice_locale": "hi-IN",
268
  "population": {
@@ -284,13 +294,15 @@
284
  "bleu": 0.29160032861883095,
285
  "chrf": 47.668399832701844,
286
  "accuracy": 0.5,
287
- "overall_score": 0.39580016430941545
 
288
  }
289
  ],
290
  "bleu": 0.29160032861883095,
291
  "chrf": 47.668399832701844,
292
  "accuracy": 0.5,
293
- "overall_score": 0.39580016430941545,
 
294
  "commonvoice_hours": 446.0,
295
  "commonvoice_locale": "es",
296
  "population": {
@@ -345,13 +357,15 @@
345
  "bleu": 0.277257629790728,
346
  "chrf": 46.62779335380641,
347
  "accuracy": 0.4666666666666667,
348
- "overall_score": 0.37196214822869733
 
349
  }
350
  ],
351
  "bleu": 0.277257629790728,
352
  "chrf": 46.62779335380641,
353
  "accuracy": 0.4666666666666667,
354
- "overall_score": 0.37196214822869733,
 
355
  "commonvoice_hours": 91.0,
356
  "commonvoice_locale": "ar",
357
  "population": {
@@ -405,13 +419,15 @@
405
  "bleu": 0.2659144372728079,
406
  "chrf": 44.14831240898717,
407
  "accuracy": 0.43333333333333335,
408
- "overall_score": 0.34962388530307065
 
409
  }
410
  ],
411
  "bleu": 0.2659144372728079,
412
  "chrf": 44.14831240898717,
413
  "accuracy": 0.43333333333333335,
414
- "overall_score": 0.34962388530307065,
 
415
  "commonvoice_hours": 77.0,
416
  "commonvoice_locale": "ur",
417
  "population": {
@@ -432,13 +448,15 @@
432
  "bleu": 0.315663773358301,
433
  "chrf": 49.253978669350964,
434
  "accuracy": 0.5666666666666667,
435
- "overall_score": 0.4411652200124838
 
436
  }
437
  ],
438
  "bleu": 0.315663773358301,
439
  "chrf": 49.253978669350964,
440
  "accuracy": 0.5666666666666667,
441
- "overall_score": 0.4411652200124838,
 
442
  "commonvoice_hours": 1052.0,
443
  "commonvoice_locale": "fr",
444
  "population": {
@@ -516,13 +534,15 @@
516
  "bleu": 0.21265887286151353,
517
  "chrf": 41.501657722373686,
518
  "accuracy": 0.4,
519
- "overall_score": 0.3063294364307568
 
520
  }
521
  ],
522
  "bleu": 0.21265887286151353,
523
  "chrf": 41.501657722373686,
524
  "accuracy": 0.4,
525
- "overall_score": 0.3063294364307568,
 
526
  "commonvoice_hours": 49.0,
527
  "commonvoice_locale": "bn",
528
  "population": {
@@ -543,13 +563,15 @@
543
  "bleu": 0.27514792195783394,
544
  "chrf": 45.901248962808694,
545
  "accuracy": 0.5666666666666667,
546
- "overall_score": 0.42090729431225027
 
547
  }
548
  ],
549
  "bleu": 0.27514792195783394,
550
  "chrf": 45.901248962808694,
551
  "accuracy": 0.5666666666666667,
552
- "overall_score": 0.42090729431225027,
 
553
  "commonvoice_hours": 177.0,
554
  "commonvoice_locale": "pt",
555
  "population": {
@@ -581,13 +603,15 @@
581
  "bleu": 0.3048037308116852,
582
  "chrf": 48.4304965568793,
583
  "accuracy": 0.5333333333333333,
584
- "overall_score": 0.41906853207250927
 
585
  }
586
  ],
587
  "bleu": 0.3048037308116852,
588
  "chrf": 48.4304965568793,
589
  "accuracy": 0.5333333333333333,
590
- "overall_score": 0.41906853207250927,
 
591
  "commonvoice_hours": 2.3,
592
  "commonvoice_locale": "pa-IN",
593
  "population": {
@@ -609,13 +633,15 @@
609
  "bleu": 0.26108507692625094,
610
  "chrf": 45.063308940468154,
611
  "accuracy": 0.5666666666666667,
612
- "overall_score": 0.4138758717964588
 
613
  }
614
  ],
615
  "bleu": 0.26108507692625094,
616
  "chrf": 45.063308940468154,
617
  "accuracy": 0.5666666666666667,
618
- "overall_score": 0.4138758717964588,
 
619
  "commonvoice_hours": 242.0,
620
  "commonvoice_locale": "ru",
621
  "population": {
@@ -654,13 +680,15 @@
654
  "bleu": 0.2709203338132304,
655
  "chrf": 44.36399636969686,
656
  "accuracy": 0.5,
657
- "overall_score": 0.3854601669066152
 
658
  }
659
  ],
660
  "bleu": 0.2709203338132304,
661
  "chrf": 44.36399636969686,
662
  "accuracy": 0.5,
663
- "overall_score": 0.3854601669066152,
 
664
  "commonvoice_hours": 411.0,
665
  "commonvoice_locale": "sw",
666
  "population": {
@@ -685,13 +713,15 @@
685
  "bleu": 0.27441353638286026,
686
  "chrf": 46.025445629112156,
687
  "accuracy": 0.6,
688
- "overall_score": 0.4372067681914301
 
689
  }
690
  ],
691
  "bleu": 0.27441353638286026,
692
  "chrf": 46.025445629112156,
693
  "accuracy": 0.6,
694
- "overall_score": 0.4372067681914301,
 
695
  "commonvoice_hours": 33.0,
696
  "commonvoice_locale": "id",
697
  "population": {
@@ -709,13 +739,15 @@
709
  "bleu": 0.3338682761061998,
710
  "chrf": 50.216731068308064,
711
  "accuracy": 0.5666666666666667,
712
- "overall_score": 0.4502674713864332
 
713
  }
714
  ],
715
  "bleu": 0.3338682761061998,
716
  "chrf": 50.216731068308064,
717
  "accuracy": 0.5666666666666667,
718
- "overall_score": 0.4502674713864332,
 
719
  "commonvoice_hours": 1358.0,
720
  "commonvoice_locale": "de",
721
  "population": {
@@ -758,13 +790,15 @@
758
  "bleu": 0.2940100667664714,
759
  "chrf": 46.403097021492236,
760
  "accuracy": 0.6,
761
- "overall_score": 0.4470050333832357
 
762
  }
763
  ],
764
  "bleu": 0.2940100667664714,
765
  "chrf": 46.403097021492236,
766
  "accuracy": 0.6,
767
- "overall_score": 0.4470050333832357,
 
768
  "commonvoice_hours": 222.0,
769
  "commonvoice_locale": "ja",
770
  "population": {
@@ -783,13 +817,15 @@
783
  "bleu": 0.2750887189010237,
784
  "chrf": 46.31463752811596,
785
  "accuracy": 0.4,
786
- "overall_score": 0.33754435945051187
 
787
  }
788
  ],
789
  "bleu": 0.2750887189010237,
790
  "chrf": 46.31463752811596,
791
  "accuracy": 0.4,
792
- "overall_score": 0.33754435945051187,
 
793
  "commonvoice_hours": 0.3,
794
  "commonvoice_locale": "te",
795
  "population": {
@@ -806,13 +842,15 @@
806
  "bleu": 0.2584800238292114,
807
  "chrf": 44.69889855306244,
808
  "accuracy": 0.5666666666666667,
809
- "overall_score": 0.41257334524793904
 
810
  }
811
  ],
812
  "bleu": 0.2584800238292114,
813
  "chrf": 44.69889855306244,
814
  "accuracy": 0.5666666666666667,
815
- "overall_score": 0.41257334524793904,
 
816
  "commonvoice_hours": 20.0,
817
  "commonvoice_locale": "mr",
818
  "population": {
@@ -829,13 +867,15 @@
829
  "bleu": 0.23082586428104943,
830
  "chrf": 41.42591471734489,
831
  "accuracy": 0.4666666666666667,
832
- "overall_score": 0.34874626547385806
 
833
  }
834
  ],
835
  "bleu": 0.23082586428104943,
836
  "chrf": 41.42591471734489,
837
  "accuracy": 0.4666666666666667,
838
- "overall_score": 0.34874626547385806,
 
839
  "commonvoice_hours": 0.0,
840
  "commonvoice_locale": "jv",
841
  "population": {
@@ -853,13 +893,15 @@
853
  "bleu": 0.252552287345529,
854
  "chrf": 43.351007120897606,
855
  "accuracy": 0.5333333333333333,
856
- "overall_score": 0.3929428103394311
 
857
  }
858
  ],
859
  "bleu": 0.252552287345529,
860
  "chrf": 43.351007120897606,
861
  "accuracy": 0.5333333333333333,
862
- "overall_score": 0.3929428103394311,
 
863
  "commonvoice_hours": 5.9,
864
  "commonvoice_locale": "vi",
865
  "population": {
 
9
  "bleu": 0.89404322120213,
10
  "chrf": 92.53933977489264,
11
  "accuracy": 0.5666666666666667,
12
+ "mlm": 0.9778605197038973,
13
+ "overall_score": 0.8128568025242314
14
  },
15
  {
16
  "model": "meta-llama/llama-3.3-70b-instruct",
17
  "bleu": 0.4351349353198866,
18
  "chrf": 54.9504915580248,
19
  "accuracy": 0.6,
20
+ "mlm": 0.9681484728467826,
21
+ "overall_score": 0.6677611360555563
22
  },
23
  {
24
  "model": "mistralai/mistral-small-24b-instruct-2501",
25
  "bleu": 0.8800468872938262,
26
  "chrf": 94.30164664106223,
27
  "accuracy": 0.5333333333333333,
28
+ "mlm": 0.804094099273989,
29
+ "overall_score": 0.7391581066337162
30
  },
31
  {
32
  "model": "google/gemini-2.0-flash-001",
33
  "bleu": 0.8489646963773831,
34
  "chrf": 92.73129066280984,
35
  "accuracy": 0.8666666666666667,
36
+ "mlm": 0.9770616407001859,
37
+ "overall_score": 0.8975643345814119
38
  },
39
  {
40
  "model": "microsoft/phi-4",
41
  "bleu": 0.8230104823079876,
42
  "chrf": 91.69043412576788,
43
  "accuracy": 0.7,
44
+ "mlm": 0.9632049588292643,
45
+ "overall_score": 0.8287384803790839
46
  }
47
  ],
48
  "bleu": 0.7762400445002428,
49
  "chrf": 85.24264055251147,
50
  "accuracy": 0.6533333333333333,
51
+ "mlm": 0.9380739382708239,
52
+ "overall_score": 0.7892157720348,
53
  "commonvoice_hours": 2651.0,
54
  "commonvoice_locale": "en",
55
  "population": {
 
220
  "bleu": 0.3977775857451761,
221
  "chrf": 57.672913792439125,
222
  "accuracy": 0.5666666666666667,
223
+ "mlm": 0.926731451729437,
224
+ "overall_score": 0.6303919013804266
225
  }
226
  ],
227
  "bleu": 0.3977775857451761,
228
  "chrf": 57.672913792439125,
229
  "accuracy": 0.5666666666666667,
230
+ "mlm": 0.926731451729437,
231
+ "overall_score": 0.6303919013804266,
232
  "commonvoice_hours": 422.0,
233
  "commonvoice_locale": "zh-TW",
234
  "population": {
 
264
  "bleu": 0.333521621016373,
265
  "chrf": 50.48364584189306,
266
  "accuracy": 0.5,
267
+ "mlm": 0.9585976421208252,
268
+ "overall_score": 0.5973730877123994
269
  }
270
  ],
271
  "bleu": 0.333521621016373,
272
  "chrf": 50.48364584189306,
273
  "accuracy": 0.5,
274
+ "mlm": 0.9585976421208252,
275
+ "overall_score": 0.5973730877123994,
276
  "commonvoice_hours": 16.0,
277
  "commonvoice_locale": "hi-IN",
278
  "population": {
 
294
  "bleu": 0.29160032861883095,
295
  "chrf": 47.668399832701844,
296
  "accuracy": 0.5,
297
+ "mlm": 0.9272973828072317,
298
+ "overall_score": 0.5729659038086875
299
  }
300
  ],
301
  "bleu": 0.29160032861883095,
302
  "chrf": 47.668399832701844,
303
  "accuracy": 0.5,
304
+ "mlm": 0.9272973828072317,
305
+ "overall_score": 0.5729659038086875,
306
  "commonvoice_hours": 446.0,
307
  "commonvoice_locale": "es",
308
  "population": {
 
357
  "bleu": 0.277257629790728,
358
  "chrf": 46.62779335380641,
359
  "accuracy": 0.4666666666666667,
360
+ "mlm": 0.9617481078420298,
361
+ "overall_score": 0.5685574680998081
362
  }
363
  ],
364
  "bleu": 0.277257629790728,
365
  "chrf": 46.62779335380641,
366
  "accuracy": 0.4666666666666667,
367
+ "mlm": 0.9617481078420298,
368
+ "overall_score": 0.5685574680998081,
369
  "commonvoice_hours": 91.0,
370
  "commonvoice_locale": "ar",
371
  "population": {
 
419
  "bleu": 0.2659144372728079,
420
  "chrf": 44.14831240898717,
421
  "accuracy": 0.43333333333333335,
422
+ "mlm": 0.9414677321132675,
423
+ "overall_score": 0.5469051675731363
424
  }
425
  ],
426
  "bleu": 0.2659144372728079,
427
  "chrf": 44.14831240898717,
428
  "accuracy": 0.43333333333333335,
429
+ "mlm": 0.9414677321132675,
430
+ "overall_score": 0.5469051675731363,
431
  "commonvoice_hours": 77.0,
432
  "commonvoice_locale": "ur",
433
  "population": {
 
448
  "bleu": 0.315663773358301,
449
  "chrf": 49.253978669350964,
450
  "accuracy": 0.5666666666666667,
451
+ "mlm": 0.960796739893282,
452
+ "overall_score": 0.6143757266394165
453
  }
454
  ],
455
  "bleu": 0.315663773358301,
456
  "chrf": 49.253978669350964,
457
  "accuracy": 0.5666666666666667,
458
+ "mlm": 0.960796739893282,
459
+ "overall_score": 0.6143757266394165,
460
  "commonvoice_hours": 1052.0,
461
  "commonvoice_locale": "fr",
462
  "population": {
 
534
  "bleu": 0.21265887286151353,
535
  "chrf": 41.501657722373686,
536
  "accuracy": 0.4,
537
+ "mlm": 0.8995272489886615,
538
+ "overall_score": 0.504062040616725
539
  }
540
  ],
541
  "bleu": 0.21265887286151353,
542
  "chrf": 41.501657722373686,
543
  "accuracy": 0.4,
544
+ "mlm": 0.8995272489886615,
545
+ "overall_score": 0.504062040616725,
546
  "commonvoice_hours": 49.0,
547
  "commonvoice_locale": "bn",
548
  "population": {
 
563
  "bleu": 0.27514792195783394,
564
  "chrf": 45.901248962808694,
565
  "accuracy": 0.5666666666666667,
566
+ "mlm": 0.9640739007405215,
567
+ "overall_score": 0.6019628297883407
568
  }
569
  ],
570
  "bleu": 0.27514792195783394,
571
  "chrf": 45.901248962808694,
572
  "accuracy": 0.5666666666666667,
573
+ "mlm": 0.9640739007405215,
574
+ "overall_score": 0.6019628297883407,
575
  "commonvoice_hours": 177.0,
576
  "commonvoice_locale": "pt",
577
  "population": {
 
603
  "bleu": 0.3048037308116852,
604
  "chrf": 48.4304965568793,
605
  "accuracy": 0.5333333333333333,
606
+ "mlm": 0.9033444436966103,
607
+ "overall_score": 0.5804938359472096
608
  }
609
  ],
610
  "bleu": 0.3048037308116852,
611
  "chrf": 48.4304965568793,
612
  "accuracy": 0.5333333333333333,
613
+ "mlm": 0.9033444436966103,
614
+ "overall_score": 0.5804938359472096,
615
  "commonvoice_hours": 2.3,
616
  "commonvoice_locale": "pa-IN",
617
  "population": {
 
633
  "bleu": 0.26108507692625094,
634
  "chrf": 45.063308940468154,
635
  "accuracy": 0.5666666666666667,
636
+ "mlm": 0.9563400339874765,
637
+ "overall_score": 0.5946972591934646
638
  }
639
  ],
640
  "bleu": 0.26108507692625094,
641
  "chrf": 45.063308940468154,
642
  "accuracy": 0.5666666666666667,
643
+ "mlm": 0.9563400339874765,
644
+ "overall_score": 0.5946972591934646,
645
  "commonvoice_hours": 242.0,
646
  "commonvoice_locale": "ru",
647
  "population": {
 
680
  "bleu": 0.2709203338132304,
681
  "chrf": 44.36399636969686,
682
  "accuracy": 0.5,
683
+ "mlm": 0.9612351448314987,
684
+ "overall_score": 0.5773851595482431
685
  }
686
  ],
687
  "bleu": 0.2709203338132304,
688
  "chrf": 44.36399636969686,
689
  "accuracy": 0.5,
690
+ "mlm": 0.9612351448314987,
691
+ "overall_score": 0.5773851595482431,
692
  "commonvoice_hours": 411.0,
693
  "commonvoice_locale": "sw",
694
  "population": {
 
713
  "bleu": 0.27441353638286026,
714
  "chrf": 46.025445629112156,
715
  "accuracy": 0.6,
716
+ "mlm": 0.9465444909745621,
717
+ "overall_score": 0.6069860091191407
718
  }
719
  ],
720
  "bleu": 0.27441353638286026,
721
  "chrf": 46.025445629112156,
722
  "accuracy": 0.6,
723
+ "mlm": 0.9465444909745621,
724
+ "overall_score": 0.6069860091191407,
725
  "commonvoice_hours": 33.0,
726
  "commonvoice_locale": "id",
727
  "population": {
 
739
  "bleu": 0.3338682761061998,
740
  "chrf": 50.216731068308064,
741
  "accuracy": 0.5666666666666667,
742
+ "mlm": 0.9526738506105953,
743
+ "overall_score": 0.6177362644611538
744
  }
745
  ],
746
  "bleu": 0.3338682761061998,
747
  "chrf": 50.216731068308064,
748
  "accuracy": 0.5666666666666667,
749
+ "mlm": 0.9526738506105953,
750
+ "overall_score": 0.6177362644611538,
751
  "commonvoice_hours": 1358.0,
752
  "commonvoice_locale": "de",
753
  "population": {
 
790
  "bleu": 0.2940100667664714,
791
  "chrf": 46.403097021492236,
792
  "accuracy": 0.6,
793
+ "mlm": 0.9337910001211718,
794
+ "overall_score": 0.609267022295881
795
  }
796
  ],
797
  "bleu": 0.2940100667664714,
798
  "chrf": 46.403097021492236,
799
  "accuracy": 0.6,
800
+ "mlm": 0.9337910001211718,
801
+ "overall_score": 0.609267022295881,
802
  "commonvoice_hours": 222.0,
803
  "commonvoice_locale": "ja",
804
  "population": {
 
817
  "bleu": 0.2750887189010237,
818
  "chrf": 46.31463752811596,
819
  "accuracy": 0.4,
820
+ "mlm": 0.9359077032699009,
821
+ "overall_score": 0.5369988073903081
822
  }
823
  ],
824
  "bleu": 0.2750887189010237,
825
  "chrf": 46.31463752811596,
826
  "accuracy": 0.4,
827
+ "mlm": 0.9359077032699009,
828
+ "overall_score": 0.5369988073903081,
829
  "commonvoice_hours": 0.3,
830
  "commonvoice_locale": "te",
831
  "population": {
 
842
  "bleu": 0.2584800238292114,
843
  "chrf": 44.69889855306244,
844
  "accuracy": 0.5666666666666667,
845
+ "mlm": 0.9351731522339883,
846
+ "overall_score": 0.5867732809099554
847
  }
848
  ],
849
  "bleu": 0.2584800238292114,
850
  "chrf": 44.69889855306244,
851
  "accuracy": 0.5666666666666667,
852
+ "mlm": 0.9351731522339883,
853
+ "overall_score": 0.5867732809099554,
854
  "commonvoice_hours": 20.0,
855
  "commonvoice_locale": "mr",
856
  "population": {
 
867
  "bleu": 0.23082586428104943,
868
  "chrf": 41.42591471734489,
869
  "accuracy": 0.4666666666666667,
870
+ "mlm": 0.9453687616674971,
871
+ "overall_score": 0.5476204308717377
872
  }
873
  ],
874
  "bleu": 0.23082586428104943,
875
  "chrf": 41.42591471734489,
876
  "accuracy": 0.4666666666666667,
877
+ "mlm": 0.9453687616674971,
878
+ "overall_score": 0.5476204308717377,
879
  "commonvoice_hours": 0.0,
880
  "commonvoice_locale": "jv",
881
  "population": {
 
893
  "bleu": 0.252552287345529,
894
  "chrf": 43.351007120897606,
895
  "accuracy": 0.5333333333333333,
896
+ "mlm": 0.9638175194388952,
897
+ "overall_score": 0.5832343800392524
898
  }
899
  ],
900
  "bleu": 0.252552287345529,
901
  "chrf": 43.351007120897606,
902
  "accuracy": 0.5333333333333333,
903
+ "mlm": 0.9638175194388952,
904
+ "overall_score": 0.5832343800392524,
905
  "commonvoice_hours": 5.9,
906
  "commonvoice_locale": "vi",
907
  "population": {