David Pomerenke committed on
Commit
8beab26
·
1 Parent(s): df383f6

Add CommonVoice stats

Browse files

But matching of language codes does not yet work well, e.g. zh-CN vs. cmn.

Files changed (4) hide show
  1. README.md +1 -0
  2. app.py +1 -0
  3. evals.py +19 -7
  4. results.json +60 -30
README.md CHANGED
@@ -8,6 +8,7 @@ license: cc-by-sa-4.0
8
  short_description: Evaluating LLM performance across all human languages.
9
  datasets:
10
  - openlanguagedata/flores_plus
 
11
  models:
12
  - meta-llama/Llama-3.3-70B-Instruct
13
  - mistralai/Mistral-Small-24B-Instruct-2501
 
8
  short_description: Evaluating LLM performance across all human languages.
9
  datasets:
10
  - openlanguagedata/flores_plus
11
+ - mozilla-foundation/common_voice_1_0
12
  models:
13
  - meta-llama/Llama-3.3-70B-Instruct
14
  - mistralai/Mistral-Small-24B-Instruct-2501
app.py CHANGED
@@ -189,6 +189,7 @@ def create_language_stats_df(results):
189
  "Best Model BLEU": round(best_score["bleu"], 3)
190
  if best_score["bleu"] is not None
191
  else "N/A",
 
192
  }
193
  flat_data.append(row)
194
 
 
189
  "Best Model BLEU": round(best_score["bleu"], 3)
190
  if best_score["bleu"] is not None
191
  else "N/A",
192
+ "CommonVoice Hours": lang["commonvoice_hours"],
193
  }
194
  flat_data.append(row)
195
 
evals.py CHANGED
@@ -12,6 +12,8 @@ from joblib.memory import Memory
12
  from openai import AsyncOpenAI
13
  from tqdm.asyncio import tqdm_asyncio
14
  from transformers import NllbTokenizer
 
 
15
 
16
  # config
17
  models = [
@@ -68,7 +70,7 @@ language_names = (
68
  language_stats = (
69
  pd.read_csv("data/languages.tsv", sep="\t")
70
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
71
- ["language_code", "speakers"]
72
  ]
73
  .dropna(subset=["language_code"])
74
  )
@@ -97,8 +99,15 @@ languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
97
  languages = languages.sort_values(by="speakers", ascending=False)
98
  languages = languages.iloc[:30]
99
 
 
 
 
 
 
 
 
100
  # sample languages to translate to
101
- target_languages_NEW = languages[languages["in_benchmark"]].sample(
102
  n=n_sentences, weights="speakers", replace=True, random_state=42
103
  )
104
  # sample languages to analyze with all models
@@ -170,7 +179,7 @@ async def main():
170
  print(name)
171
  scores = []
172
  if language.in_benchmark:
173
- original_sentences_NEW = load_sentences(language)[:n_sentences]
174
  for model in models:
175
  if (
176
  model != fast_model
@@ -184,16 +193,16 @@ async def main():
184
  translate(
185
  model, language.language_name, language.script_name, sentence
186
  )
187
- for sentence, language in zip(original_sentences_NEW, target_languages_NEW.itertuples())
188
  ]
189
  predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
190
- target_sentences_NEW = [
191
  load_sentences(lang)[i]
192
- for i, lang in enumerate(target_languages_NEW.itertuples())
193
  ]
194
  metrics_bleu = bleu.compute(
195
  predictions=predictions,
196
- references=target_sentences_NEW,
197
  tokenizer=tokenizer.tokenize,
198
  )
199
  # metrics_bert = bertscore.compute(
@@ -208,6 +217,8 @@ async def main():
208
  # "bert_score": mean(metrics_bert["f1"]),
209
  }
210
  )
 
 
211
  results.append(
212
  {
213
  "language_name": name,
@@ -216,6 +227,7 @@ async def main():
216
  "scores": scores,
217
  "bleu": mean([s["bleu"] for s in scores]) if scores else None,
218
  # "bert_score": mean([s["bert_score"] for s in scores]),
 
219
  }
220
  )
221
  with open("results.json", "w") as f:
 
12
  from openai import AsyncOpenAI
13
  from tqdm.asyncio import tqdm_asyncio
14
  from transformers import NllbTokenizer
15
+ from datetime import date
16
+ from requests import get
17
 
18
  # config
19
  models = [
 
70
  language_stats = (
71
  pd.read_csv("data/languages.tsv", sep="\t")
72
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
73
+ ["language_code", "speakers", "iso639_1"]
74
  ]
75
  .dropna(subset=["language_code"])
76
  )
 
99
  languages = languages.sort_values(by="speakers", ascending=False)
100
  languages = languages.iloc[:30]
101
 
102
+ # retrieve CommonVoice stats
103
+ @cache # cache for 1 day
104
+ def get_commonvoice_stats(date: date):
105
+ return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
106
+
107
+ commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today()))
108
+
109
  # sample languages to translate to
110
+ target_languages = languages[languages["in_benchmark"]].sample(
111
  n=n_sentences, weights="speakers", replace=True, random_state=42
112
  )
113
  # sample languages to analyze with all models
 
179
  print(name)
180
  scores = []
181
  if language.in_benchmark:
182
+ original_sentences = load_sentences(language)[:n_sentences]
183
  for model in models:
184
  if (
185
  model != fast_model
 
193
  translate(
194
  model, language.language_name, language.script_name, sentence
195
  )
196
+ for sentence, language in zip(original_sentences, target_languages.itertuples())
197
  ]
198
  predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
199
+ target_sentences = [
200
  load_sentences(lang)[i]
201
+ for i, lang in enumerate(target_languages.itertuples())
202
  ]
203
  metrics_bleu = bleu.compute(
204
  predictions=predictions,
205
+ references=target_sentences,
206
  tokenizer=tokenizer.tokenize,
207
  )
208
  # metrics_bert = bertscore.compute(
 
217
  # "bert_score": mean(metrics_bert["f1"]),
218
  }
219
  )
220
+ commonvoice_hours = commonvoice_stats[commonvoice_stats["locale"] == language.iso639_1]["validatedHours"].values
221
+ commonvoice_hours = commonvoice_hours[0] if commonvoice_hours.size > 0 else "N/A"
222
  results.append(
223
  {
224
  "language_name": name,
 
227
  "scores": scores,
228
  "bleu": mean([s["bleu"] for s in scores]) if scores else None,
229
  # "bert_score": mean([s["bert_score"] for s in scores]),
230
+ "commonvoice_hours": commonvoice_hours,
231
  }
232
  )
233
  with open("results.json", "w") as f:
results.json CHANGED
@@ -29,7 +29,8 @@
29
  "bleu": 0.44668905281921456
30
  }
31
  ],
32
- "bleu": 0.47384102687918905
 
33
  },
34
  {
35
  "language_name": "Mandarin Chinese",
@@ -41,7 +42,8 @@
41
  "bleu": 0.48254866511762295
42
  }
43
  ],
44
- "bleu": 0.48254866511762295
 
45
  },
46
  {
47
  "language_name": "Spanish",
@@ -53,7 +55,8 @@
53
  "bleu": 0.31606621368361204
54
  }
55
  ],
56
- "bleu": 0.31606621368361204
 
57
  },
58
  {
59
  "language_name": "Hindi",
@@ -65,7 +68,8 @@
65
  "bleu": 0.3273225856613046
66
  }
67
  ],
68
- "bleu": 0.3273225856613046
 
69
  },
70
  {
71
  "language_name": "Bengali",
@@ -77,7 +81,8 @@
77
  "bleu": 0.23110496173302814
78
  }
79
  ],
80
- "bleu": 0.23110496173302814
 
81
  },
82
  {
83
  "language_name": "Portuguese",
@@ -89,7 +94,8 @@
89
  "bleu": 0.35032125995743685
90
  }
91
  ],
92
- "bleu": 0.35032125995743685
 
93
  },
94
  {
95
  "language_name": "French",
@@ -101,7 +107,8 @@
101
  "bleu": 0.31625053573185663
102
  }
103
  ],
104
- "bleu": 0.31625053573185663
 
105
  },
106
  {
107
  "language_name": "Indonesian",
@@ -113,7 +120,8 @@
113
  "bleu": 0.3112185444311794
114
  }
115
  ],
116
- "bleu": 0.3112185444311794
 
117
  },
118
  {
119
  "language_name": "Russian",
@@ -145,7 +153,8 @@
145
  "bleu": 0.31289371159965956
146
  }
147
  ],
148
- "bleu": 0.3346024224541269
 
149
  },
150
  {
151
  "language_name": "Japanese",
@@ -177,7 +186,8 @@
177
  "bleu": 0.2585222780278109
178
  }
179
  ],
180
- "bleu": 0.2790237571605942
 
181
  },
182
  {
183
  "language_name": "Eastern Punjabi",
@@ -189,7 +199,8 @@
189
  "bleu": 0.27325501919134315
190
  }
191
  ],
192
- "bleu": 0.27325501919134315
 
193
  },
194
  {
195
  "language_name": "Standard German",
@@ -221,7 +232,8 @@
221
  "bleu": 0.36047992103881465
222
  }
223
  ],
224
- "bleu": 0.3898869846770727
 
225
  },
226
  {
227
  "language_name": "Egyptian Arabic",
@@ -253,7 +265,8 @@
253
  "bleu": 0.19969813973959594
254
  }
255
  ],
256
- "bleu": 0.23482952277259375
 
257
  },
258
  {
259
  "language_name": "Urdu",
@@ -285,7 +298,8 @@
285
  "bleu": 0.2285337340113323
286
  }
287
  ],
288
- "bleu": 0.2690020545084802
 
289
  },
290
  {
291
  "language_name": "Filipino",
@@ -297,7 +311,8 @@
297
  "bleu": 0.33268969497468076
298
  }
299
  ],
300
- "bleu": 0.33268969497468076
 
301
  },
302
  {
303
  "language_name": "Javanese",
@@ -309,7 +324,8 @@
309
  "bleu": 0.2528746866064681
310
  }
311
  ],
312
- "bleu": 0.2528746866064681
 
313
  },
314
  {
315
  "language_name": "Marathi",
@@ -321,7 +337,8 @@
321
  "bleu": 0.24876051941895777
322
  }
323
  ],
324
- "bleu": 0.24876051941895777
 
325
  },
326
  {
327
  "language_name": "Swahili",
@@ -353,7 +370,8 @@
353
  "bleu": 0.21803176063271826
354
  }
355
  ],
356
- "bleu": 0.3070798470243923
 
357
  },
358
  {
359
  "language_name": "Turkish",
@@ -365,7 +383,8 @@
365
  "bleu": 0.29874140544434125
366
  }
367
  ],
368
- "bleu": 0.29874140544434125
 
369
  },
370
  {
371
  "language_name": "Telugu",
@@ -377,14 +396,16 @@
377
  "bleu": 0.28869836899054496
378
  }
379
  ],
380
- "bleu": 0.28869836899054496
 
381
  },
382
  {
383
  "language_name": "Wu Chinese",
384
  "language_code": "wuu",
385
  "speakers": 81400000.0,
386
  "scores": [],
387
- "bleu": null
 
388
  },
389
  {
390
  "language_name": "Korean",
@@ -396,7 +417,8 @@
396
  "bleu": 0.2566453806044083
397
  }
398
  ],
399
- "bleu": 0.2566453806044083
 
400
  },
401
  {
402
  "language_name": "Vietnamese",
@@ -428,7 +450,8 @@
428
  "bleu": 0.18355331419148843
429
  }
430
  ],
431
- "bleu": 0.3011065238905742
 
432
  },
433
  {
434
  "language_name": "Tamil",
@@ -460,7 +483,8 @@
460
  "bleu": 0.12646276530642359
461
  }
462
  ],
463
- "bleu": 0.23483954884287706
 
464
  },
465
  {
466
  "language_name": "Yue Chinese",
@@ -472,7 +496,8 @@
472
  "bleu": 0.2663995648378034
473
  }
474
  ],
475
- "bleu": 0.2663995648378034
 
476
  },
477
  {
478
  "language_name": "Italian",
@@ -484,7 +509,8 @@
484
  "bleu": 0.3190660116366235
485
  }
486
  ],
487
- "bleu": 0.3190660116366235
 
488
  },
489
  {
490
  "language_name": "Gujarati",
@@ -516,7 +542,8 @@
516
  "bleu": 0.19669824113063106
517
  }
518
  ],
519
- "bleu": 0.2589873172783296
 
520
  },
521
  {
522
  "language_name": "Iranian Persian",
@@ -528,7 +555,8 @@
528
  "bleu": 0.28359916806993934
529
  }
530
  ],
531
- "bleu": 0.28359916806993934
 
532
  },
533
  {
534
  "language_name": "Bhojpuri",
@@ -540,13 +568,15 @@
540
  "bleu": 0.24311504988281543
541
  }
542
  ],
543
- "bleu": 0.24311504988281543
 
544
  },
545
  {
546
  "language_name": "Hakka Chinese",
547
  "language_code": "hak",
548
  "speakers": 48200000.0,
549
  "scores": [],
550
- "bleu": null
 
551
  }
552
  ]
 
29
  "bleu": 0.44668905281921456
30
  }
31
  ],
32
+ "bleu": 0.47384102687918905,
33
+ "commonvoice_hours": 2649.0
34
  },
35
  {
36
  "language_name": "Mandarin Chinese",
 
42
  "bleu": 0.48254866511762295
43
  }
44
  ],
45
+ "bleu": 0.48254866511762295,
46
+ "commonvoice_hours": "N/A"
47
  },
48
  {
49
  "language_name": "Spanish",
 
55
  "bleu": 0.31606621368361204
56
  }
57
  ],
58
+ "bleu": 0.31606621368361204,
59
+ "commonvoice_hours": 446.0
60
  },
61
  {
62
  "language_name": "Hindi",
 
68
  "bleu": 0.3273225856613046
69
  }
70
  ],
71
+ "bleu": 0.3273225856613046,
72
+ "commonvoice_hours": 16.0
73
  },
74
  {
75
  "language_name": "Bengali",
 
81
  "bleu": 0.23110496173302814
82
  }
83
  ],
84
+ "bleu": 0.23110496173302814,
85
+ "commonvoice_hours": 49.0
86
  },
87
  {
88
  "language_name": "Portuguese",
 
94
  "bleu": 0.35032125995743685
95
  }
96
  ],
97
+ "bleu": 0.35032125995743685,
98
+ "commonvoice_hours": 176.0
99
  },
100
  {
101
  "language_name": "French",
 
107
  "bleu": 0.31625053573185663
108
  }
109
  ],
110
+ "bleu": 0.31625053573185663,
111
+ "commonvoice_hours": 1051.0
112
  },
113
  {
114
  "language_name": "Indonesian",
 
120
  "bleu": 0.3112185444311794
121
  }
122
  ],
123
+ "bleu": 0.3112185444311794,
124
+ "commonvoice_hours": 33.0
125
  },
126
  {
127
  "language_name": "Russian",
 
153
  "bleu": 0.31289371159965956
154
  }
155
  ],
156
+ "bleu": 0.3346024224541269,
157
+ "commonvoice_hours": 241.0
158
  },
159
  {
160
  "language_name": "Japanese",
 
186
  "bleu": 0.2585222780278109
187
  }
188
  ],
189
+ "bleu": 0.2790237571605942,
190
+ "commonvoice_hours": 222.0
191
  },
192
  {
193
  "language_name": "Eastern Punjabi",
 
199
  "bleu": 0.27325501919134315
200
  }
201
  ],
202
+ "bleu": 0.27325501919134315,
203
+ "commonvoice_hours": "N/A"
204
  },
205
  {
206
  "language_name": "Standard German",
 
232
  "bleu": 0.36047992103881465
233
  }
234
  ],
235
+ "bleu": 0.3898869846770727,
236
+ "commonvoice_hours": 1357.0
237
  },
238
  {
239
  "language_name": "Egyptian Arabic",
 
265
  "bleu": 0.19969813973959594
266
  }
267
  ],
268
+ "bleu": 0.23482952277259375,
269
+ "commonvoice_hours": "N/A"
270
  },
271
  {
272
  "language_name": "Urdu",
 
298
  "bleu": 0.2285337340113323
299
  }
300
  ],
301
+ "bleu": 0.2690020545084802,
302
+ "commonvoice_hours": 76.0
303
  },
304
  {
305
  "language_name": "Filipino",
 
311
  "bleu": 0.33268969497468076
312
  }
313
  ],
314
+ "bleu": 0.33268969497468076,
315
+ "commonvoice_hours": "N/A"
316
  },
317
  {
318
  "language_name": "Javanese",
 
324
  "bleu": 0.2528746866064681
325
  }
326
  ],
327
+ "bleu": 0.2528746866064681,
328
+ "commonvoice_hours": 0.0
329
  },
330
  {
331
  "language_name": "Marathi",
 
337
  "bleu": 0.24876051941895777
338
  }
339
  ],
340
+ "bleu": 0.24876051941895777,
341
+ "commonvoice_hours": 20.0
342
  },
343
  {
344
  "language_name": "Swahili",
 
370
  "bleu": 0.21803176063271826
371
  }
372
  ],
373
+ "bleu": 0.3070798470243923,
374
+ "commonvoice_hours": "N/A"
375
  },
376
  {
377
  "language_name": "Turkish",
 
383
  "bleu": 0.29874140544434125
384
  }
385
  ],
386
+ "bleu": 0.29874140544434125,
387
+ "commonvoice_hours": 127.0
388
  },
389
  {
390
  "language_name": "Telugu",
 
396
  "bleu": 0.28869836899054496
397
  }
398
  ],
399
+ "bleu": 0.28869836899054496,
400
+ "commonvoice_hours": 0.3
401
  },
402
  {
403
  "language_name": "Wu Chinese",
404
  "language_code": "wuu",
405
  "speakers": 81400000.0,
406
  "scores": [],
407
+ "bleu": null,
408
+ "commonvoice_hours": "N/A"
409
  },
410
  {
411
  "language_name": "Korean",
 
417
  "bleu": 0.2566453806044083
418
  }
419
  ],
420
+ "bleu": 0.2566453806044083,
421
+ "commonvoice_hours": 1.7
422
  },
423
  {
424
  "language_name": "Vietnamese",
 
450
  "bleu": 0.18355331419148843
451
  }
452
  ],
453
+ "bleu": 0.3011065238905742,
454
+ "commonvoice_hours": 5.9
455
  },
456
  {
457
  "language_name": "Tamil",
 
483
  "bleu": 0.12646276530642359
484
  }
485
  ],
486
+ "bleu": 0.23483954884287706,
487
+ "commonvoice_hours": 234.0
488
  },
489
  {
490
  "language_name": "Yue Chinese",
 
496
  "bleu": 0.2663995648378034
497
  }
498
  ],
499
+ "bleu": 0.2663995648378034,
500
+ "commonvoice_hours": "N/A"
501
  },
502
  {
503
  "language_name": "Italian",
 
509
  "bleu": 0.3190660116366235
510
  }
511
  ],
512
+ "bleu": 0.3190660116366235,
513
+ "commonvoice_hours": 362.0
514
  },
515
  {
516
  "language_name": "Gujarati",
 
542
  "bleu": 0.19669824113063106
543
  }
544
  ],
545
+ "bleu": 0.2589873172783296,
546
+ "commonvoice_hours": "N/A"
547
  },
548
  {
549
  "language_name": "Iranian Persian",
 
555
  "bleu": 0.28359916806993934
556
  }
557
  ],
558
+ "bleu": 0.28359916806993934,
559
+ "commonvoice_hours": "N/A"
560
  },
561
  {
562
  "language_name": "Bhojpuri",
 
568
  "bleu": 0.24311504988281543
569
  }
570
  ],
571
+ "bleu": 0.24311504988281543,
572
+ "commonvoice_hours": "N/A"
573
  },
574
  {
575
  "language_name": "Hakka Chinese",
576
  "language_code": "hak",
577
  "speakers": 48200000.0,
578
  "scores": [],
579
+ "bleu": null,
580
+ "commonvoice_hours": "N/A"
581
  }
582
  ]