David Pomerenke
committed on
Commit
·
8633921
1
Parent(s):
573d32f
More evals
Browse files
app.py
CHANGED
@@ -286,7 +286,7 @@ def create_language_stats_df(metric):
|
|
286 |
|
287 |
df = pd.DataFrame(flat_data)
|
288 |
return gr.DataFrame(
|
289 |
-
value=df,
|
290 |
label="Language Results",
|
291 |
show_search="search",
|
292 |
datatype=[
|
@@ -297,6 +297,7 @@ def create_language_stats_df(metric):
|
|
297 |
"number", # Translation
|
298 |
"number", # Classification
|
299 |
"number", # MLM
|
|
|
300 |
"markdown", # Best Model
|
301 |
"markdown", # CommonVoice Hours
|
302 |
],
|
|
|
286 |
|
287 |
df = pd.DataFrame(flat_data)
|
288 |
return gr.DataFrame(
|
289 |
+
value=df,
|
290 |
label="Language Results",
|
291 |
show_search="search",
|
292 |
datatype=[
|
|
|
297 |
"number", # Translation
|
298 |
"number", # Classification
|
299 |
"number", # MLM
|
300 |
+
"number", # ASR
|
301 |
"markdown", # Best Model
|
302 |
"markdown", # CommonVoice Hours
|
303 |
],
|
evals.py
CHANGED
@@ -42,11 +42,11 @@ models = [
|
|
42 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
43 |
transcription_models = [
|
44 |
"elevenlabs/scribe_v1",
|
45 |
-
"openai/whisper-large-v3
|
46 |
# "openai/whisper-small",
|
47 |
# "facebook/seamless-m4t-v2-large",
|
48 |
]
|
49 |
-
transcription_model_fast = "
|
50 |
n_sentences = 30
|
51 |
|
52 |
# ===== setup =====
|
@@ -203,14 +203,14 @@ languages = pd.merge(
|
|
203 |
) # "left" because keep it simple for now
|
204 |
languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
|
205 |
|
206 |
-
languages = languages.sort_values(by="speakers", ascending=False).iloc[:
|
207 |
|
208 |
# sample languages to translate to
|
209 |
target_languages = languages[languages["in_benchmark"]].sample(
|
210 |
n=n_sentences, weights="speakers", replace=True, random_state=42
|
211 |
)
|
212 |
# sample languages to analyze with all models
|
213 |
-
detailed_languages = languages[languages["in_benchmark"]].iloc[:
|
214 |
|
215 |
|
216 |
# ===== define tasks and metrics =====
|
|
|
42 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
43 |
transcription_models = [
|
44 |
"elevenlabs/scribe_v1",
|
45 |
+
"openai/whisper-large-v3",
|
46 |
# "openai/whisper-small",
|
47 |
# "facebook/seamless-m4t-v2-large",
|
48 |
]
|
49 |
+
transcription_model_fast = "elevenlabs/scribe_v1"
|
50 |
n_sentences = 30
|
51 |
|
52 |
# ===== setup =====
|
|
|
203 |
) # "left" because keep it simple for now
|
204 |
languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
|
205 |
|
206 |
+
languages = languages.sort_values(by="speakers", ascending=False).iloc[:10]
|
207 |
|
208 |
# sample languages to translate to
|
209 |
target_languages = languages[languages["in_benchmark"]].sample(
|
210 |
n=n_sentences, weights="speakers", replace=True, random_state=42
|
211 |
)
|
212 |
# sample languages to analyze with all models
|
213 |
+
detailed_languages = languages[languages["in_benchmark"]].iloc[:5]
|
214 |
|
215 |
|
216 |
# ===== define tasks and metrics =====
|
results.json
CHANGED
@@ -56,10 +56,10 @@
|
|
56 |
"overall_score": 0.34916319968417603
|
57 |
},
|
58 |
{
|
59 |
-
"model": "openai/whisper-large-v3
|
60 |
"model_type": "speech-to-text",
|
61 |
-
"asr_wer": 0.
|
62 |
-
"overall_score": 0.
|
63 |
}
|
64 |
],
|
65 |
"commonvoice_hours": 2651.0,
|
@@ -226,8 +226,8 @@
|
|
226 |
"mt_chrf": 65.91917919929946,
|
227 |
"cls_acc": 0.6533333333333333,
|
228 |
"mlm_chrf": 93.48244773503015,
|
229 |
-
"asr_wer": 0.
|
230 |
-
"overall_score": 0.
|
231 |
},
|
232 |
{
|
233 |
"language_name": "Chinese",
|
@@ -286,7 +286,7 @@
|
|
286 |
"overall_score": 1.0
|
287 |
},
|
288 |
{
|
289 |
-
"model": "openai/whisper-large-v3
|
290 |
"model_type": "speech-to-text",
|
291 |
"asr_wer": 1.0,
|
292 |
"overall_score": 1.0
|
@@ -329,6 +329,15 @@
|
|
329 |
"bcp_47": "hi",
|
330 |
"speakers": 546882144,
|
331 |
"scores": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
{
|
333 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
334 |
"model_type": "text-to-text",
|
@@ -339,10 +348,43 @@
|
|
339 |
"overall_score": 0.6657108425749162
|
340 |
},
|
341 |
{
|
342 |
-
"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
343 |
"model_type": "speech-to-text",
|
344 |
-
"asr_wer": 0.
|
345 |
-
"overall_score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
346 |
}
|
347 |
],
|
348 |
"commonvoice_hours": 16.0,
|
@@ -356,18 +398,27 @@
|
|
356 |
"ZA": 1129272
|
357 |
},
|
358 |
"language_family": "Indo-European",
|
359 |
-
"mt_bleu": 0.
|
360 |
-
"mt_chrf":
|
361 |
-
"cls_acc": 0.
|
362 |
-
"mlm_chrf":
|
363 |
-
"asr_wer": 0.
|
364 |
-
"overall_score": 0.
|
365 |
},
|
366 |
{
|
367 |
"language_name": "Spanish",
|
368 |
"bcp_47": "es",
|
369 |
"speakers": 493528077,
|
370 |
"scores": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
{
|
372 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
373 |
"model_type": "text-to-text",
|
@@ -378,10 +429,43 @@
|
|
378 |
"overall_score": 0.6635684659512185
|
379 |
},
|
380 |
{
|
381 |
-
"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
382 |
"model_type": "speech-to-text",
|
383 |
-
"asr_wer": 0.
|
384 |
-
"overall_score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
}
|
386 |
],
|
387 |
"commonvoice_hours": 446.0,
|
@@ -428,18 +512,27 @@
|
|
428 |
"VE": 23488572
|
429 |
},
|
430 |
"language_family": "Indo-European",
|
431 |
-
"mt_bleu": 0.
|
432 |
-
"mt_chrf":
|
433 |
-
"cls_acc": 0.
|
434 |
-
"mlm_chrf":
|
435 |
-
"asr_wer": 0.
|
436 |
-
"overall_score": 0.
|
437 |
},
|
438 |
{
|
439 |
"language_name": "Arabic",
|
440 |
"bcp_47": "ar",
|
441 |
"speakers": 351664197,
|
442 |
"scores": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
{
|
444 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
445 |
"model_type": "text-to-text",
|
@@ -450,10 +543,43 @@
|
|
450 |
"overall_score": 0.6638448614180232
|
451 |
},
|
452 |
{
|
453 |
-
"model": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
"model_type": "speech-to-text",
|
455 |
-
"asr_wer": 0.
|
456 |
-
"overall_score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
457 |
}
|
458 |
],
|
459 |
"commonvoice_hours": 91.0,
|
@@ -499,11 +625,270 @@
|
|
499 |
"YE": 22114456
|
500 |
},
|
501 |
"language_family": "Afro-Asiatic",
|
502 |
-
"mt_bleu": 0.
|
503 |
-
"mt_chrf":
|
504 |
-
"cls_acc": 0.
|
505 |
-
"mlm_chrf": 94.
|
506 |
-
"asr_wer": 0.
|
507 |
-
"overall_score": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
508 |
}
|
509 |
]
|
|
|
56 |
"overall_score": 0.34916319968417603
|
57 |
},
|
58 |
{
|
59 |
+
"model": "openai/whisper-large-v3",
|
60 |
"model_type": "speech-to-text",
|
61 |
+
"asr_wer": 0.25418986127300397,
|
62 |
+
"overall_score": 0.25418986127300397
|
63 |
}
|
64 |
],
|
65 |
"commonvoice_hours": 2651.0,
|
|
|
226 |
"mt_chrf": 65.91917919929946,
|
227 |
"cls_acc": 0.6533333333333333,
|
228 |
"mlm_chrf": 93.48244773503015,
|
229 |
+
"asr_wer": 0.30167653047859,
|
230 |
+
"overall_score": 0.6212765331549852
|
231 |
},
|
232 |
{
|
233 |
"language_name": "Chinese",
|
|
|
286 |
"overall_score": 1.0
|
287 |
},
|
288 |
{
|
289 |
+
"model": "openai/whisper-large-v3",
|
290 |
"model_type": "speech-to-text",
|
291 |
"asr_wer": 1.0,
|
292 |
"overall_score": 1.0
|
|
|
329 |
"bcp_47": "hi",
|
330 |
"speakers": 546882144,
|
331 |
"scores": [
|
332 |
+
{
|
333 |
+
"model": "openai/gpt-4o-mini",
|
334 |
+
"model_type": "text-to-text",
|
335 |
+
"mt_bleu": 0.3647010036099328,
|
336 |
+
"mt_chrf": 55.294100726869324,
|
337 |
+
"cls_acc": 0.5,
|
338 |
+
"mlm_chrf": 93.01665261992896,
|
339 |
+
"overall_score": 0.6610358444893277
|
340 |
+
},
|
341 |
{
|
342 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
343 |
"model_type": "text-to-text",
|
|
|
348 |
"overall_score": 0.6657108425749162
|
349 |
},
|
350 |
{
|
351 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
352 |
+
"model_type": "text-to-text",
|
353 |
+
"mt_bleu": 0.346304931512649,
|
354 |
+
"mt_chrf": 52.59738987149503,
|
355 |
+
"cls_acc": 0.5,
|
356 |
+
"mlm_chrf": 90.59820426203353,
|
357 |
+
"overall_score": 0.6439853137784286
|
358 |
+
},
|
359 |
+
{
|
360 |
+
"model": "google/gemini-2.0-flash-001",
|
361 |
+
"model_type": "text-to-text",
|
362 |
+
"mt_bleu": 0.43820082761650603,
|
363 |
+
"mt_chrf": 61.68241787594198,
|
364 |
+
"cls_acc": 0.8666666666666667,
|
365 |
+
"mlm_chrf": 96.5534261573122,
|
366 |
+
"overall_score": 0.8163417023330695
|
367 |
+
},
|
368 |
+
{
|
369 |
+
"model": "microsoft/phi-4",
|
370 |
+
"model_type": "text-to-text",
|
371 |
+
"mt_bleu": 0.388038924689894,
|
372 |
+
"mt_chrf": 56.13147190849697,
|
373 |
+
"cls_acc": 0.8,
|
374 |
+
"mlm_chrf": 94.47772196307584,
|
375 |
+
"overall_score": 0.7686973129052427
|
376 |
+
},
|
377 |
+
{
|
378 |
+
"model": "elevenlabs/scribe_v1",
|
379 |
"model_type": "speech-to-text",
|
380 |
+
"asr_wer": 0.2338948365728121,
|
381 |
+
"overall_score": 0.2338948365728121
|
382 |
+
},
|
383 |
+
{
|
384 |
+
"model": "openai/whisper-large-v3",
|
385 |
+
"model_type": "speech-to-text",
|
386 |
+
"asr_wer": 0.43522263872986894,
|
387 |
+
"overall_score": 0.43522263872986894
|
388 |
}
|
389 |
],
|
390 |
"commonvoice_hours": 16.0,
|
|
|
398 |
"ZA": 1129272
|
399 |
},
|
400 |
"language_family": "Indo-European",
|
401 |
+
"mt_bleu": 0.3722589421671684,
|
402 |
+
"mt_chrf": 55.849246285459195,
|
403 |
+
"cls_acc": 0.6333333333333334,
|
404 |
+
"mlm_chrf": 94.16368134606655,
|
405 |
+
"asr_wer": 0.33455873765134053,
|
406 |
+
"overall_score": 0.6035554987690951
|
407 |
},
|
408 |
{
|
409 |
"language_name": "Spanish",
|
410 |
"bcp_47": "es",
|
411 |
"speakers": 493528077,
|
412 |
"scores": [
|
413 |
+
{
|
414 |
+
"model": "openai/gpt-4o-mini",
|
415 |
+
"model_type": "text-to-text",
|
416 |
+
"mt_bleu": 0.3313892289629975,
|
417 |
+
"mt_chrf": 52.582336167000754,
|
418 |
+
"cls_acc": 0.6,
|
419 |
+
"mlm_chrf": 96.34068599793908,
|
420 |
+
"overall_score": 0.6964100738831327
|
421 |
+
},
|
422 |
{
|
423 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
424 |
"model_type": "text-to-text",
|
|
|
429 |
"overall_score": 0.6635684659512185
|
430 |
},
|
431 |
{
|
432 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
433 |
+
"model_type": "text-to-text",
|
434 |
+
"mt_bleu": 0.3120516523386294,
|
435 |
+
"mt_chrf": 50.14036662376262,
|
436 |
+
"cls_acc": 0.6333333333333333,
|
437 |
+
"mlm_chrf": 93.9463761308063,
|
438 |
+
"overall_score": 0.6914002536263407
|
439 |
+
},
|
440 |
+
{
|
441 |
+
"model": "google/gemini-2.0-flash-001",
|
442 |
+
"model_type": "text-to-text",
|
443 |
+
"mt_bleu": 0.3318101035886167,
|
444 |
+
"mt_chrf": 53.80973313056067,
|
445 |
+
"cls_acc": 0.8666666666666667,
|
446 |
+
"mlm_chrf": 96.46210398041012,
|
447 |
+
"overall_score": 0.7897950125921249
|
448 |
+
},
|
449 |
+
{
|
450 |
+
"model": "microsoft/phi-4",
|
451 |
+
"model_type": "text-to-text",
|
452 |
+
"mt_bleu": 0.2808051746473726,
|
453 |
+
"mt_chrf": 49.858821063770044,
|
454 |
+
"cls_acc": 0.7666666666666667,
|
455 |
+
"mlm_chrf": 96.28984542933327,
|
456 |
+
"overall_score": 0.7427177771992333
|
457 |
+
},
|
458 |
+
{
|
459 |
+
"model": "elevenlabs/scribe_v1",
|
460 |
"model_type": "speech-to-text",
|
461 |
+
"asr_wer": 0.19653905528613333,
|
462 |
+
"overall_score": 0.19653905528613333
|
463 |
+
},
|
464 |
+
{
|
465 |
+
"model": "openai/whisper-large-v3",
|
466 |
+
"model_type": "speech-to-text",
|
467 |
+
"asr_wer": 0.17561491933862197,
|
468 |
+
"overall_score": 0.17561491933862197
|
469 |
}
|
470 |
],
|
471 |
"commonvoice_hours": 446.0,
|
|
|
512 |
"VE": 23488572
|
513 |
},
|
514 |
"language_family": "Indo-European",
|
515 |
+
"mt_bleu": 0.31438710613980736,
|
516 |
+
"mt_chrf": 51.70682164947916,
|
517 |
+
"cls_acc": 0.6733333333333333,
|
518 |
+
"mlm_chrf": 95.99334001231053,
|
519 |
+
"asr_wer": 0.18607698731237765,
|
520 |
+
"overall_score": 0.5651493654109723
|
521 |
},
|
522 |
{
|
523 |
"language_name": "Arabic",
|
524 |
"bcp_47": "ar",
|
525 |
"speakers": 351664197,
|
526 |
"scores": [
|
527 |
+
{
|
528 |
+
"model": "openai/gpt-4o-mini",
|
529 |
+
"model_type": "text-to-text",
|
530 |
+
"mt_bleu": 0.312307215788044,
|
531 |
+
"mt_chrf": 51.50028782321851,
|
532 |
+
"cls_acc": 0.6,
|
533 |
+
"mlm_chrf": 96.57677413527875,
|
534 |
+
"overall_score": 0.6935902065283241
|
535 |
+
},
|
536 |
{
|
537 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
538 |
"model_type": "text-to-text",
|
|
|
543 |
"overall_score": 0.6638448614180232
|
544 |
},
|
545 |
{
|
546 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
547 |
+
"model_type": "text-to-text",
|
548 |
+
"mt_bleu": 0.26940670420361396,
|
549 |
+
"mt_chrf": 46.07654794208784,
|
550 |
+
"cls_acc": 0.7,
|
551 |
+
"mlm_chrf": 90.92876728332028,
|
552 |
+
"overall_score": 0.690017717418027
|
553 |
+
},
|
554 |
+
{
|
555 |
+
"model": "google/gemini-2.0-flash-001",
|
556 |
+
"model_type": "text-to-text",
|
557 |
+
"mt_bleu": 0.3928034519711188,
|
558 |
+
"mt_chrf": 57.47507639595937,
|
559 |
+
"cls_acc": 0.9,
|
560 |
+
"mlm_chrf": 96.82551362297947,
|
561 |
+
"overall_score": 0.8143353000631294
|
562 |
+
},
|
563 |
+
{
|
564 |
+
"model": "microsoft/phi-4",
|
565 |
+
"model_type": "text-to-text",
|
566 |
+
"mt_bleu": 0.2700754695807907,
|
567 |
+
"mt_chrf": 45.896533062231306,
|
568 |
+
"cls_acc": 0.7,
|
569 |
+
"mlm_chrf": 93.60903960225609,
|
570 |
+
"overall_score": 0.6983519088816247
|
571 |
+
},
|
572 |
+
{
|
573 |
+
"model": "elevenlabs/scribe_v1",
|
574 |
"model_type": "speech-to-text",
|
575 |
+
"asr_wer": 0.2685436379713873,
|
576 |
+
"overall_score": 0.2685436379713873
|
577 |
+
},
|
578 |
+
{
|
579 |
+
"model": "openai/whisper-large-v3",
|
580 |
+
"model_type": "speech-to-text",
|
581 |
+
"asr_wer": 0.17370718156523782,
|
582 |
+
"overall_score": 0.17370718156523782
|
583 |
}
|
584 |
],
|
585 |
"commonvoice_hours": 91.0,
|
|
|
625 |
"YE": 22114456
|
626 |
},
|
627 |
"language_family": "Afro-Asiatic",
|
628 |
+
"mt_bleu": 0.32801443817625614,
|
629 |
+
"mt_chrf": 51.692994508571644,
|
630 |
+
"cls_acc": 0.6733333333333332,
|
631 |
+
"mlm_chrf": 94.58207181664275,
|
632 |
+
"asr_wer": 0.22112540976831258,
|
633 |
+
"overall_score": 0.5717701162636791
|
634 |
+
},
|
635 |
+
{
|
636 |
+
"language_name": "Urdu",
|
637 |
+
"bcp_47": "ur",
|
638 |
+
"speakers": 290790290,
|
639 |
+
"scores": [
|
640 |
+
{
|
641 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
642 |
+
"model_type": "text-to-text",
|
643 |
+
"mt_bleu": 0.26585004461425726,
|
644 |
+
"mt_chrf": 47.37157150967947,
|
645 |
+
"cls_acc": 0.43333333333333335,
|
646 |
+
"mlm_chrf": 94.38802161979918,
|
647 |
+
"overall_score": 0.6169764215427066
|
648 |
+
},
|
649 |
+
{
|
650 |
+
"model": "elevenlabs/scribe_v1",
|
651 |
+
"model_type": "speech-to-text",
|
652 |
+
"asr_wer": 0.2982973325975355,
|
653 |
+
"overall_score": 0.2982973325975355
|
654 |
+
}
|
655 |
+
],
|
656 |
+
"commonvoice_hours": 77.0,
|
657 |
+
"commonvoice_locale": "ur",
|
658 |
+
"population": {
|
659 |
+
"CA": 286475,
|
660 |
+
"GB": 2301638,
|
661 |
+
"IN": 66304500,
|
662 |
+
"MU": 71727,
|
663 |
+
"PK": 221825950
|
664 |
+
},
|
665 |
+
"language_family": "Indo-European",
|
666 |
+
"mt_bleu": 0.26585004461425726,
|
667 |
+
"mt_chrf": 47.37157150967947,
|
668 |
+
"cls_acc": 0.43333333333333335,
|
669 |
+
"mlm_chrf": 94.38802161979918,
|
670 |
+
"asr_wer": 0.2982973325975355,
|
671 |
+
"overall_score": 0.457636877070121
|
672 |
+
},
|
673 |
+
{
|
674 |
+
"language_name": "French",
|
675 |
+
"bcp_47": "fr",
|
676 |
+
"speakers": 278611507,
|
677 |
+
"scores": [
|
678 |
+
{
|
679 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
680 |
+
"model_type": "text-to-text",
|
681 |
+
"mt_bleu": 0.3510210872150948,
|
682 |
+
"mt_chrf": 55.795595938804894,
|
683 |
+
"cls_acc": 0.5666666666666667,
|
684 |
+
"mlm_chrf": 97.12318847922649,
|
685 |
+
"overall_score": 0.6986181702823268
|
686 |
+
},
|
687 |
+
{
|
688 |
+
"model": "elevenlabs/scribe_v1",
|
689 |
+
"model_type": "speech-to-text",
|
690 |
+
"asr_wer": 0.2610754929736961,
|
691 |
+
"overall_score": 0.2610754929736961
|
692 |
+
}
|
693 |
+
],
|
694 |
+
"commonvoice_hours": 1052.0,
|
695 |
+
"commonvoice_locale": "fr",
|
696 |
+
"population": {
|
697 |
+
"AD": 5775,
|
698 |
+
"AT": 974540,
|
699 |
+
"BE": 4453866,
|
700 |
+
"BF": 4583788,
|
701 |
+
"BI": 7000822,
|
702 |
+
"BJ": 4502610,
|
703 |
+
"BL": 6837,
|
704 |
+
"CA": 11308230,
|
705 |
+
"CD": 3867640,
|
706 |
+
"CF": 2935521,
|
707 |
+
"CG": 4446179,
|
708 |
+
"CH": 1764838,
|
709 |
+
"CI": 13465739,
|
710 |
+
"CM": 18866600,
|
711 |
+
"CY": 88668,
|
712 |
+
"DE": 14428746,
|
713 |
+
"DJ": 19358,
|
714 |
+
"DZ": 8594580,
|
715 |
+
"FR": 67169718,
|
716 |
+
"GA": 1405473,
|
717 |
+
"GB": 15125053,
|
718 |
+
"GF": 153622,
|
719 |
+
"GN": 3632946,
|
720 |
+
"GP": 407498,
|
721 |
+
"GQ": 73584,
|
722 |
+
"GR": 954639,
|
723 |
+
"HT": 520187,
|
724 |
+
"HU": 293155,
|
725 |
+
"IE": 880017,
|
726 |
+
"IT": 3931370,
|
727 |
+
"KM": 473917,
|
728 |
+
"LB": 20238,
|
729 |
+
"LU": 546691,
|
730 |
+
"MA": 7112340,
|
731 |
+
"MC": 38610,
|
732 |
+
"MF": 32556,
|
733 |
+
"MG": 18599433,
|
734 |
+
"ML": 8994564,
|
735 |
+
"MQ": 427408,
|
736 |
+
"MR": 680932,
|
737 |
+
"MT": 50299,
|
738 |
+
"MU": 41381,
|
739 |
+
"NC": 278409,
|
740 |
+
"NE": 6603996,
|
741 |
+
"NL": 5011316,
|
742 |
+
"PF": 180024,
|
743 |
+
"PM": 5133,
|
744 |
+
"PT": 1545405,
|
745 |
+
"RE": 700950,
|
746 |
+
"RO": 3621493,
|
747 |
+
"RW": 2288,
|
748 |
+
"SC": 57589,
|
749 |
+
"SN": 6137196,
|
750 |
+
"SY": 1144506,
|
751 |
+
"TD": 4388124,
|
752 |
+
"TF": 140,
|
753 |
+
"TG": 5251148,
|
754 |
+
"TN": 8673688,
|
755 |
+
"US": 1862778,
|
756 |
+
"VU": 149166,
|
757 |
+
"WF": 7610,
|
758 |
+
"YT": 110580
|
759 |
+
},
|
760 |
+
"language_family": "Indo-European",
|
761 |
+
"mt_bleu": 0.3510210872150948,
|
762 |
+
"mt_chrf": 55.795595938804894,
|
763 |
+
"cls_acc": 0.5666666666666667,
|
764 |
+
"mlm_chrf": 97.12318847922649,
|
765 |
+
"asr_wer": 0.2610754929736961,
|
766 |
+
"overall_score": 0.47984683162801145
|
767 |
+
},
|
768 |
+
{
|
769 |
+
"language_name": "Bangla",
|
770 |
+
"bcp_47": "bn",
|
771 |
+
"speakers": 267193288,
|
772 |
+
"scores": [
|
773 |
+
{
|
774 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
775 |
+
"model_type": "text-to-text",
|
776 |
+
"mt_bleu": 0.2874920154082786,
|
777 |
+
"mt_chrf": 51.49279116112809,
|
778 |
+
"cls_acc": 0.4,
|
779 |
+
"mlm_chrf": 90.6067262108039,
|
780 |
+
"overall_score": 0.6069983912397733
|
781 |
+
},
|
782 |
+
{
|
783 |
+
"model": "elevenlabs/scribe_v1",
|
784 |
+
"model_type": "speech-to-text",
|
785 |
+
"asr_wer": 0.26686188207927336,
|
786 |
+
"overall_score": 0.26686188207927336
|
787 |
+
}
|
788 |
+
],
|
789 |
+
"commonvoice_hours": 49.0,
|
790 |
+
"commonvoice_locale": "bn",
|
791 |
+
"population": {
|
792 |
+
"BD": 159397980,
|
793 |
+
"CA": 90466,
|
794 |
+
"GB": 263044,
|
795 |
+
"IN": 107413290,
|
796 |
+
"NP": 28508
|
797 |
+
},
|
798 |
+
"language_family": "Indo-European",
|
799 |
+
"mt_bleu": 0.2874920154082786,
|
800 |
+
"mt_chrf": 51.49279116112809,
|
801 |
+
"cls_acc": 0.4,
|
802 |
+
"mlm_chrf": 90.6067262108039,
|
803 |
+
"asr_wer": 0.26686188207927336,
|
804 |
+
"overall_score": 0.4369301366595233
|
805 |
+
},
|
806 |
+
{
|
807 |
+
"language_name": "Portuguese",
|
808 |
+
"bcp_47": "pt",
|
809 |
+
"speakers": 237496885,
|
810 |
+
"scores": [
|
811 |
+
{
|
812 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
813 |
+
"model_type": "text-to-text",
|
814 |
+
"mt_bleu": 0.33491649454450034,
|
815 |
+
"mt_chrf": 54.60211868234021,
|
816 |
+
"cls_acc": 0.5666666666666667,
|
817 |
+
"mlm_chrf": 96.52676764996336,
|
818 |
+
"overall_score": 0.6926518433299008
|
819 |
+
},
|
820 |
+
{
|
821 |
+
"model": "elevenlabs/scribe_v1",
|
822 |
+
"model_type": "speech-to-text",
|
823 |
+
"asr_wer": 0.22967756370402836,
|
824 |
+
"overall_score": 0.22967756370402836
|
825 |
+
}
|
826 |
+
],
|
827 |
+
"commonvoice_hours": 177.0,
|
828 |
+
"commonvoice_locale": "pt",
|
829 |
+
"population": {
|
830 |
+
"AG": 1571,
|
831 |
+
"AO": 21789941,
|
832 |
+
"BR": 192661560,
|
833 |
+
"CA": 229934,
|
834 |
+
"CH": 285736,
|
835 |
+
"CV": 443274,
|
836 |
+
"FR": 882027,
|
837 |
+
"GB": 131522,
|
838 |
+
"GQ": 1,
|
839 |
+
"GW": 1927100,
|
840 |
+
"LU": 100541,
|
841 |
+
"MO": 30723,
|
842 |
+
"MZ": 8126514,
|
843 |
+
"PT": 9890592,
|
844 |
+
"ST": 179454,
|
845 |
+
"TL": 816395
|
846 |
+
},
|
847 |
+
"language_family": "Indo-European",
|
848 |
+
"mt_bleu": 0.33491649454450034,
|
849 |
+
"mt_chrf": 54.60211868234021,
|
850 |
+
"cls_acc": 0.5666666666666667,
|
851 |
+
"mlm_chrf": 96.52676764996336,
|
852 |
+
"asr_wer": 0.22967756370402836,
|
853 |
+
"overall_score": 0.4611647035169646
|
854 |
+
},
|
855 |
+
{
|
856 |
+
"language_name": "Punjabi",
|
857 |
+
"bcp_47": "pa",
|
858 |
+
"speakers": 203571210,
|
859 |
+
"scores": [
|
860 |
+
{
|
861 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
862 |
+
"model_type": "text-to-text",
|
863 |
+
"mt_bleu": 0.3078917767345886,
|
864 |
+
"mt_chrf": 50.505686987696365,
|
865 |
+
"cls_acc": 0.5333333333333333,
|
866 |
+
"mlm_chrf": 90.10119297923285,
|
867 |
+
"overall_score": 0.6464673776675418
|
868 |
+
},
|
869 |
+
{
|
870 |
+
"model": "elevenlabs/scribe_v1",
|
871 |
+
"model_type": "speech-to-text",
|
872 |
+
"asr_wer": 0.20953788908863977,
|
873 |
+
"overall_score": 0.20953788908863977
|
874 |
+
}
|
875 |
+
],
|
876 |
+
"commonvoice_hours": 2.3,
|
877 |
+
"commonvoice_locale": "pa-IN",
|
878 |
+
"population": {
|
879 |
+
"CA": 603106,
|
880 |
+
"GB": 2367400,
|
881 |
+
"IN": 37130520,
|
882 |
+
"KE": 10170,
|
883 |
+
"PK": 163450700,
|
884 |
+
"SG": 9314
|
885 |
+
},
|
886 |
+
"language_family": "Indo-European",
|
887 |
+
"mt_bleu": 0.3078917767345886,
|
888 |
+
"mt_chrf": 50.505686987696365,
|
889 |
+
"cls_acc": 0.5333333333333333,
|
890 |
+
"mlm_chrf": 90.10119297923285,
|
891 |
+
"asr_wer": 0.20953788908863977,
|
892 |
+
"overall_score": 0.4280026333780908
|
893 |
}
|
894 |
]
|
uv.lock
CHANGED
@@ -1245,7 +1245,7 @@ dev = [
|
|
1245 |
[package.metadata]
|
1246 |
requires-dist = [
|
1247 |
{ name = "gradio", specifier = ">=5.16.2" },
|
1248 |
-
{ name = "language-data" },
|
1249 |
{ name = "pandas", specifier = ">=2.2.3" },
|
1250 |
{ name = "plotly", specifier = ">=6.0.0" },
|
1251 |
{ name = "pycountry", specifier = ">=24.6.1" },
|
|
|
1245 |
[package.metadata]
|
1246 |
requires-dist = [
|
1247 |
{ name = "gradio", specifier = ">=5.16.2" },
|
1248 |
+
{ name = "language-data", specifier = ">=1.3.0" },
|
1249 |
{ name = "pandas", specifier = ">=2.2.3" },
|
1250 |
{ name = "plotly", specifier = ">=6.0.0" },
|
1251 |
{ name = "pycountry", specifier = ">=24.6.1" },
|