David Pomerenke committed
Commit 56081d8 · 1 Parent(s): 8190782

Parallelize everything, select most populous script

Files changed (2):
  1. evals.py +91 -86
  2. results.json +456 -64
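
The "most populous script" part of this commit is the new aggregate_flores_paths helper in evals.py below: when FLORES ships the same language in several scripts, only the variant whose script has the largest writing population is kept. A minimal standalone sketch of that idea (a plain list instead of the pandas Series the real helper receives; the example paths are hypothetical):

from langcodes import Language, standardize_tag

def pick_most_populous_script(flores_paths: list[str]) -> str:
    # flores_paths: FLORES file suffixes for one language, e.g. ["zho_Hans", "zho_Hant"] (hypothetical)
    populations = [
        Language.get(standardize_tag(path, macro=True)).writing_population()
        for path in flores_paths
    ]
    # keep the path whose script is written by the most people
    return flores_paths[populations.index(max(populations))]
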
evals.py CHANGED
@@ -2,6 +2,7 @@ import asyncio
 import json
 import os
 import re
+from datetime import date
 from os import getenv
 
 import evaluate
@@ -10,13 +11,12 @@ import requests
 from aiolimiter import AsyncLimiter
 from dotenv import load_dotenv
 from joblib.memory import Memory
+from langcodes import Language, standardize_tag
+from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
 from openai import AsyncOpenAI
+from requests import get
 from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer
-from datetime import date
-from requests import get
-from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
-from langcodes import standardize_tag, Language
 
 # config
 models = [
@@ -40,17 +40,11 @@ client = AsyncOpenAI(
 )
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
-bertscore = evaluate.load("bertscore")
+# bertscore = evaluate.load("bertscore")
 tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
 rate_limit = AsyncLimiter(max_rate=20, time_period=1)
 
 
-def reorder(language_name):
-    if "," in language_name and "(" not in language_name:
-        return language_name.split(",")[1] + " " + language_name.split(",")[0]
-    return language_name
-
-
 # load general language data
 languages = {
     lang: pop
@@ -58,7 +52,9 @@ languages = {
     if not re.match(r".*-[A-Z]{2}$", lang)
 }
 languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
-languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
+languages["language_name"] = languages["bcp_47"].apply(
+    lambda x: Language.get(x).display_name()
+)
 
 # load script codes and names
 scripts = pd.read_csv("data/ScriptCodes.csv").rename(
@@ -70,15 +66,26 @@ def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
 
 
+def aggregate_flores_paths(flores_paths):
+    # takes a list of paths from the same language but different scripts
+    # returns the one with the largest writing population
+    if len(flores_paths) == 1:
+        return flores_paths.values[0]
+    populations = [
+        Language.get(standardize_tag(x, macro=True)).writing_population()
+        for x in flores_paths.values
+    ]
+    return flores_paths.values[populations.index(max(populations))]
+
+
 # load benchmark languages and scripts
 benchmark_dir = "data/floresp-v2.0-rc.3/dev"
 benchmark_languages = pd.DataFrame(
-    [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
-    columns=["iso639_3", "iso15924"],
+    [f.split(".")[1] for f in os.listdir(benchmark_dir)],
+    columns=["flores_path"],
 )
-benchmark_languages["bcp_47"] = benchmark_languages.apply(
-    lambda row: standardize_tag(row["iso639_3"] + "-" + row["iso15924"], macro=True),
-    axis=1,
+benchmark_languages["bcp_47"] = benchmark_languages["flores_path"].apply(
+    lambda x: standardize_tag(x, macro=True),
 )
 # ignore script (language is language)
 benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
@@ -86,7 +93,7 @@ benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
 )
 benchmark_languages = (
     benchmark_languages.groupby("bcp_47")
-    .agg({"iso639_3": "first", "iso15924": "first"})
+    .agg({"flores_path": aggregate_flores_paths})
     .reset_index()
 )
 
@@ -123,14 +130,14 @@ languages = pd.merge(
 languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
 
 languages = languages.sort_values(by="speakers", ascending=False)
-languages = languages.iloc[:10]
+languages = languages.iloc[:30]
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(n=3, random_state=42)
+detailed_languages = languages[languages["in_benchmark"]].sample(n=10, random_state=42)
 
 
 # utils
@@ -158,93 +165,91 @@ async def complete(**kwargs):
     return response
 
 
-async def translate(model, target_language, sentence):
-    script = script_name(target_language.iso15924)
+def load_sentences(language):
+    return open(f"{benchmark_dir}/dev.{language.flores_path}").readlines()
+
+
+@cache
+async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
+    original_language = languages[languages["bcp_47"] == original_language_bcp_47].iloc[
+        0
+    ]
+    target_language = target_languages.iloc[sentence_nr]
+    original_sentence = load_sentences(original_language)[sentence_nr].strip()
+    target_sentence = load_sentences(target_language)[sentence_nr].strip()
+    script = script_name(target_language.flores_path.split("_")[1])
     reply = await complete(
         model=model,
        messages=[
            {
                "role": "user",
-                "content": f"Translate the following text to the {target_language.name} language; use the {script} script; reply only with the translation:\n\n{sentence}",
+                "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
            }
        ],
        temperature=0,
        max_tokens=1024,
    )
-    return reply.choices[0].message.content
-
-
-def mean(l):
-    return sum(l) / len(l) if l else 0
+    prediction = reply.choices[0].message.content.strip()
+    score = bleu.compute(
+        predictions=[prediction],
+        references=[target_sentence],
+        tokenizer=tokenizer.tokenize,
+    )
+    return {
+        "model": model,
+        "bcp_47": original_language["bcp_47"],
+        "bleu": score["bleu"],
+        "sentence_nr": sentence_nr,
+    }
 
 
-def load_sentences(language):
-    return open(
-        f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}"
-    ).readlines()
+def mean(lst):
+    return sum(lst) / len(lst) if lst else 0
 
 
 # evaluation!
 async def main():
+    scores = [
+        translate_and_evaluate(model, original_language.bcp_47, i)
+        for i in range(n_sentences)
+        for original_language in languages.itertuples()
+        for model in models
+        if original_language.in_benchmark
+        and (
+            model == fast_model
+            or original_language.bcp_47 in detailed_languages.bcp_47.values
+        )
+    ]
+    scores = await tqdm_asyncio.gather(*scores, miniters=1)
     results = []
-    for language in list(languages.itertuples()):
-        scores = []
-        if language.in_benchmark:
-            original_sentences = load_sentences(language)[:n_sentences]
-            for model in models:
-                if (
-                    model != fast_model
-                    and language.bcp_47 not in detailed_languages.bcp_47.values
-                ):
-                    continue
-                predictions = [
-                    translate(
-                        model,
-                        language,
-                        sentence,
-                    )
-                    for sentence, language in zip(
-                        original_sentences, target_languages.itertuples()
-                    )
-                ]
-                predictions = await tqdm_asyncio.gather(
-                    *predictions,
-                    miniters=1,
-                    desc=f"{language.name} {model.split('/')[0]}",
-                )
-                target_sentences = [
-                    load_sentences(lang)[i]
-                    for i, lang in enumerate(target_languages.itertuples())
-                ]
-                metrics_bleu = bleu.compute(
-                    predictions=predictions,
-                    references=target_sentences,
-                    tokenizer=tokenizer.tokenize,
-                )
-                # metrics_bert = bertscore.compute(
-                #     predictions=predictions,
-                #     references=target_sentences,
-                #     model_type="distilbert-base-uncased",
-                # )
-                scores.append(
+    for language in languages.itertuples():
+        results_for_language = []
+        for model in models:
+            results_for_model = [
+                score
+                for score in scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
+            if results_for_model:
+                bleu = mean([s["bleu"] for s in results_for_model])
+                results_for_language.append(
                    {
                        "model": model,
-                        "bleu": metrics_bleu["bleu"],
-                        # "bert_score": mean(metrics_bert["f1"]),
+                        "bleu": bleu,
                    }
                )
-        results.append(
-            {
-                "language_name": language.name,
-                "bcp_47": language.bcp_47,
-                "speakers": language.speakers if not pd.isna(language.speakers) else 0,
-                "scores": scores,
-                "bleu": mean([s["bleu"] for s in scores]) if scores else None,
-                # "bert_score": mean([s["bert_score"] for s in scores]),
-                "commonvoice_hours": language.commonvoice_hours,
-                "commonvoice_locale": language.commonvoice_locale,
-            }
-        )
+        if results_for_language:
+            results.append(
+                {
+                    "language_name": language.language_name,
+                    "bcp_47": language.bcp_47,
+                    "speakers": language.speakers,
+                    "scores": results_for_language,
+                    "bleu": mean([s["bleu"] for s in results_for_language]),
+                    "commonvoice_hours": language.commonvoice_hours,
+                    "commonvoice_locale": language.commonvoice_locale,
+                }
+            )
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
 
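
The parallelization itself reduces to building one coroutine per (model, language, sentence) combination up front and awaiting them all at once with tqdm_asyncio.gather, as the new main() does. A minimal self-contained sketch of that pattern, with a dummy task standing in for the real OpenRouter call and BLEU scoring (the model and language names are placeholders):

import asyncio
from tqdm.asyncio import tqdm_asyncio

async def translate_and_evaluate(model: str, bcp_47: str, sentence_nr: int) -> dict:
    # stand-in for the real API call + BLEU computation
    await asyncio.sleep(0.01)
    return {"model": model, "bcp_47": bcp_47, "sentence_nr": sentence_nr}

async def main() -> list[dict]:
    models = ["fast-model", "other-model"]  # hypothetical names
    languages = ["en", "de", "ja"]
    tasks = [
        translate_and_evaluate(model, lang, i)
        for i in range(3)
        for lang in languages
        for model in models
    ]
    # await everything concurrently, with a progress bar (same call as in evals.py)
    return await tqdm_asyncio.gather(*tasks, miniters=1)

if __name__ == "__main__":
    print(len(asyncio.run(main())))
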
results.json CHANGED
@@ -3,49 +3,49 @@
     "language_name": "English",
     "bcp_47": "en",
     "speakers": 1636485840,
-    "scores": [
-      {
-        "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.4931825583688982
-      }
-    ],
-    "bleu": 0.4931825583688982,
-    "commonvoice_hours": 2649.0,
-    "commonvoice_locale": "en"
-  },
-  {
-    "language_name": "Chinese",
-    "bcp_47": "zh",
-    "speakers": 1304678914,
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.4807599914028467
+        "bleu": 0.5292544231540742
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.48224897154012053
+        "bleu": 0.465648126623753
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.2688927547323512
+        "bleu": 0.478174166015779
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.4876059353172742
+        "bleu": 0.5266708610727185
       },
       {
         "model": "deepseek/deepseek-chat",
-        "bleu": 0.46126489333496423
+        "bleu": 0.5549134525314846
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.43306718920654086
+        "bleu": 0.4668163276973811
+      }
+    ],
+    "bleu": 0.5035795595158651,
+    "commonvoice_hours": 2649.0,
+    "commonvoice_locale": "en"
+  },
+  {
+    "language_name": "Chinese",
+    "bcp_47": "zh",
+    "speakers": 1304678914,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.35763875438716014
       }
     ],
-    "bleu": 0.4356399559223496,
+    "bleu": 0.35763875438716014,
     "commonvoice_hours": 422.0,
-    "commonvoice_locale": "zh-TW"
+    "commonvoice_locale": "zh-HK"
   },
   {
     "language_name": "Hindi",
@@ -54,12 +54,12 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.42910938007537924
+        "bleu": 0.33760351976648345
       }
     ],
-    "bleu": 0.42910938007537924,
+    "bleu": 0.33760351976648345,
     "commonvoice_hours": 16.0,
-    "commonvoice_locale": "hi-IN"
+    "commonvoice_locale": "hi"
   },
   {
     "language_name": "Spanish",
@@ -68,10 +68,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3335615012680206
+        "bleu": 0.3600460831160618
       }
     ],
-    "bleu": 0.3335615012680206,
+    "bleu": 0.3600460831160618,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es"
   },
@@ -82,10 +82,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.19072998559991275
+        "bleu": 0.3046598747480405
       }
     ],
-    "bleu": 0.19072998559991275,
+    "bleu": 0.3046598747480405,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar"
   },
@@ -94,32 +94,12 @@
     "bcp_47": "ur",
     "speakers": 290790290,
     "scores": [
-      {
-        "model": "openai/gpt-4o-mini",
-        "bleu": 0.3223557428811336
-      },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3361392064611452
-      },
-      {
-        "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.30361668093990973
-      },
-      {
-        "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.38811035932918286
-      },
-      {
-        "model": "deepseek/deepseek-chat",
-        "bleu": 0.33221997814253806
-      },
-      {
-        "model": "microsoft/phi-4",
-        "bleu": 0.2541447606474814
+        "bleu": 0.331647033312127
       }
     ],
-    "bleu": 0.32276445473356513,
+    "bleu": 0.331647033312127,
     "commonvoice_hours": 76.0,
     "commonvoice_locale": "ur"
   },
@@ -130,10 +110,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.40595466651226686
+        "bleu": 0.3141809404018014
       }
     ],
-    "bleu": 0.40595466651226686,
+    "bleu": 0.3141809404018014,
     "commonvoice_hours": 1051.0,
     "commonvoice_locale": "fr"
   },
@@ -144,10 +124,10 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.30570858536443696
+        "bleu": 0.27472181972977344
       }
     ],
-    "bleu": 0.30570858536443696,
+    "bleu": 0.27472181972977344,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn"
   },
@@ -158,30 +138,30 @@
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.4122096638493346
+        "bleu": 0.36418677020025814
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.39250552075952033
+        "bleu": 0.36847793827413045
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.22643923104785263
+        "bleu": 0.33146858114564615
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.42197093736929103
+        "bleu": 0.3685111782334586
       },
       {
         "model": "deepseek/deepseek-chat",
-        "bleu": 0.42783260235353093
+        "bleu": 0.41976380092637283
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.38611444119797594
+        "bleu": 0.35431476252948624
       }
     ],
-    "bleu": 0.3778453994295843,
+    "bleu": 0.367787171884892,
     "commonvoice_hours": 176.0,
     "commonvoice_locale": "pt"
   },
@@ -190,13 +170,425 @@
     "bcp_47": "pa",
     "speakers": 203571210,
     "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.29479385926490154
+      },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.34311946995454473
+        "bleu": 0.34372813238670347
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.24553184949811938
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.3934178960662497
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3489400123993954
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.26926813301032654
       }
     ],
-    "bleu": 0.34311946995454473,
+    "bleu": 0.31594664710428266,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN"
+  },
+  {
+    "language_name": "Russian",
+    "bcp_47": "ru",
+    "speakers": 195841151,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2920291935463745
+      }
+    ],
+    "bleu": 0.2920291935463745,
+    "commonvoice_hours": 241.0,
+    "commonvoice_locale": "ru"
+  },
+  {
+    "language_name": "Swahili",
+    "bcp_47": "sw",
+    "speakers": 171610296,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.3240516590412694
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3021494866906426
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.21392015063903014
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.39351510575974585
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.32036034973159405
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.2572750657835761
+      }
+    ],
+    "bleu": 0.3018786362743097,
+    "commonvoice_hours": 411.0,
+    "commonvoice_locale": "sw"
+  },
+  {
+    "language_name": "Indonesian",
+    "bcp_47": "id",
+    "speakers": 171207687,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.31923635687963403
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.32764790212460226
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.2387340248344293
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.36831341439353155
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3614031163582736
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.2526105547535859
+      }
+    ],
+    "bleu": 0.31132422822400946,
+    "commonvoice_hours": 33.0,
+    "commonvoice_locale": "id"
+  },
+  {
+    "language_name": "German",
+    "bcp_47": "de",
+    "speakers": 136350226,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.39299196408709347
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3886659265736507
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.35731041330816654
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.46630655663486287
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.4373279553229372
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.353010712972096
+      }
+    ],
+    "bleu": 0.3992689214831344,
+    "commonvoice_hours": 1357.0,
+    "commonvoice_locale": "de"
+  },
+  {
+    "language_name": "Japanese",
+    "bcp_47": "ja",
+    "speakers": 119729026,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2954810072264808
+      }
+    ],
+    "bleu": 0.2954810072264808,
+    "commonvoice_hours": 222.0,
+    "commonvoice_locale": "ja"
+  },
+  {
+    "language_name": "Telugu",
+    "bcp_47": "te",
+    "speakers": 95478480,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.37949545228579734
+      }
+    ],
+    "bleu": 0.37949545228579734,
+    "commonvoice_hours": 0.3,
+    "commonvoice_locale": "te"
+  },
+  {
+    "language_name": "Marathi",
+    "bcp_47": "mr",
+    "speakers": 92826300,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2852384896861461
+      }
+    ],
+    "bleu": 0.2852384896861461,
+    "commonvoice_hours": 20.0,
+    "commonvoice_locale": "mr"
+  },
+  {
+    "language_name": "Javanese",
+    "bcp_47": "jv",
+    "speakers": 91180665,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.2755399920693052
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2494035065095152
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.1266725662438766
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.35614761567604236
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.29069945440951733
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.20468330413608699
+      }
+    ],
+    "bleu": 0.2505244065073906,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "jv"
+  },
+  {
+    "language_name": "Vietnamese",
+    "bcp_47": "vi",
+    "speakers": 86222962,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2956750563565745
+      }
+    ],
+    "bleu": 0.2956750563565745,
+    "commonvoice_hours": 5.9,
+    "commonvoice_locale": "vi"
+  },
+  {
+    "language_name": "Tamil",
+    "bcp_47": "ta",
+    "speakers": 85616159,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.27547489589987734
+      }
+    ],
+    "bleu": 0.27547489589987734,
+    "commonvoice_hours": 234.0,
+    "commonvoice_locale": "ta"
+  },
+  {
+    "language_name": "Persian",
+    "bcp_47": "fa",
+    "speakers": 84710459,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2858012364771329
+      }
+    ],
+    "bleu": 0.2858012364771329,
+    "commonvoice_hours": 370.0,
+    "commonvoice_locale": "fa"
+  },
+  {
+    "language_name": "Turkish",
+    "bcp_47": "tr",
+    "speakers": 80360704,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.32005697883543305
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3128582218784996
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.26166377989267786
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.3488811534537982
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.352126761953689
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.22855630110633351
+      }
+    ],
+    "bleu": 0.30402386618673855,
+    "commonvoice_hours": 127.0,
+    "commonvoice_locale": "tr"
+  },
+  {
+    "language_name": "Cantonese",
+    "bcp_47": "yue",
+    "speakers": 79654759,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.25523473174207373
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.2901127503841879
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.23880603698191288
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.33330775674699475
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.30942219437451896
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.25167599008414604
+      }
+    ],
+    "bleu": 0.27975991005230577,
+    "commonvoice_hours": 203.0,
+    "commonvoice_locale": "yue"
+  },
+  {
+    "language_name": "Korean",
+    "bcp_47": "ko",
+    "speakers": 78357046,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.24501349273295708
+      }
+    ],
+    "bleu": 0.24501349273295708,
+    "commonvoice_hours": 1.7,
+    "commonvoice_locale": "ko"
+  },
+  {
+    "language_name": "Italian",
+    "bcp_47": "it",
+    "speakers": 70247060,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3273249067267197
+      }
+    ],
+    "bleu": 0.3273249067267197,
+    "commonvoice_hours": 362.0,
+    "commonvoice_locale": "it"
+  },
+  {
+    "language_name": "Filipino",
+    "bcp_47": "fil",
+    "speakers": 67471096,
+    "scores": [
+      {
+        "model": "openai/gpt-4o-mini",
+        "bleu": 0.35950288667055635
+      },
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.3458571802193247
+      },
+      {
+        "model": "mistralai/mistral-small-24b-instruct-2501",
+        "bleu": 0.2769096553598123
+      },
+      {
+        "model": "google/gemini-2.0-flash-001",
+        "bleu": 0.4030081046637165
+      },
+      {
+        "model": "deepseek/deepseek-chat",
+        "bleu": 0.3712699611966998
+      },
+      {
+        "model": "microsoft/phi-4",
+        "bleu": 0.25550756070033753
+      }
+    ],
+    "bleu": 0.3353425581350746,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "tl"
+  },
+  {
+    "language_name": "Egyptian Arabic",
+    "bcp_47": "arz",
+    "speakers": 66639360,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.23431638822117362
+      }
+    ],
+    "bleu": 0.23431638822117362,
+    "commonvoice_hours": NaN,
+    "commonvoice_locale": NaN
+  },
+  {
+    "language_name": "Gujarati",
+    "bcp_47": "gu",
+    "speakers": 61721799,
+    "scores": [
+      {
+        "model": "meta-llama/llama-3.3-70b-instruct",
+        "bleu": 0.27834507803114356
+      }
+    ],
+    "bleu": 0.27834507803114356,
+    "commonvoice_hours": 0.0,
+    "commonvoice_locale": "gu-IN"
   }
 ]