David Pomerenke
committed on
Commit
·
175993f
1
Parent(s):
63202a2
Newer models, run on 20 languages
Browse files
- evals.py +11 -9
- results.json +285 -21
evals.py
CHANGED
@@ -15,14 +15,16 @@ from transformers import NllbTokenizer
|
|
15 |
|
16 |
# config
|
17 |
models = [
|
18 |
-
"openai/gpt-4o-mini",
|
19 |
-
"anthropic/claude-3.5-haiku",
|
20 |
-
|
21 |
-
|
22 |
-
"google/gemini-flash-
|
23 |
-
# "qwen/qwen-
|
|
|
|
|
24 |
]
|
25 |
-
fast_model = "
|
26 |
n_sentences = 30
|
27 |
|
28 |
# setup
|
@@ -93,7 +95,7 @@ languages = pd.merge(benchmark_languages, languages, on="language_code", how="ou
|
|
93 |
languages = pd.merge(languages, script_names, on="script_code", how="left")
|
94 |
languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
|
95 |
languages = languages.sort_values(by="speakers", ascending=False)
|
96 |
-
languages = languages.iloc[:
|
97 |
|
98 |
# sample languages to translate to
|
99 |
target_languages_NEW = languages[languages["in_benchmark"]].sample(
|
@@ -101,7 +103,7 @@ target_languages_NEW = languages[languages["in_benchmark"]].sample(
|
|
101 |
)
|
102 |
# sample languages to analyze with all models
|
103 |
detailed_languages = languages[languages["in_benchmark"]].sample(
|
104 |
-
n=
|
105 |
)
|
106 |
|
107 |
|
|
|
15 |
|
16 |
# config
|
17 |
models = [
|
18 |
+
"openai/gpt-4o-mini", # 0.6$/M tokens
|
19 |
+
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
|
20 |
+
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
21 |
+
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
22 |
+
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
23 |
+
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
24 |
+
"deepseek/deepseek-chat", # 0.9$/M tokens
|
25 |
+
"microsoft/phi-4", # 0.07$/M tokens
|
26 |
]
|
27 |
+
fast_model = "meta-llama/llama-3.3-70b-instruct"
|
28 |
n_sentences = 30
|
29 |
|
30 |
# setup
|
|
|
95 |
languages = pd.merge(languages, script_names, on="script_code", how="left")
|
96 |
languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
|
97 |
languages = languages.sort_values(by="speakers", ascending=False)
|
98 |
+
languages = languages.iloc[:20]
|
99 |
|
100 |
# sample languages to translate to
|
101 |
target_languages_NEW = languages[languages["in_benchmark"]].sample(
|
|
|
103 |
)
|
104 |
# sample languages to analyze with all models
|
105 |
detailed_languages = languages[languages["in_benchmark"]].sample(
|
106 |
+
n=5, random_state=42
|
107 |
)
|
108 |
|
109 |
|
results.json
CHANGED
@@ -5,11 +5,31 @@
|
|
5 |
"speakers": 1132366680.0,
|
6 |
"scores": [
|
7 |
{
|
8 |
-
"model": "
|
9 |
-
"bleu": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
}
|
11 |
],
|
12 |
-
"bleu": 0.
|
13 |
},
|
14 |
{
|
15 |
"language_name": "Mandarin Chinese",
|
@@ -18,18 +38,30 @@
|
|
18 |
"scores": [
|
19 |
{
|
20 |
"model": "openai/gpt-4o-mini",
|
21 |
-
"bleu": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
},
|
23 |
{
|
24 |
-
"model": "
|
25 |
-
"bleu": 0.
|
26 |
},
|
27 |
{
|
28 |
-
"model": "
|
29 |
-
"bleu": 0.
|
30 |
}
|
31 |
],
|
32 |
-
"bleu": 0.
|
33 |
},
|
34 |
{
|
35 |
"language_name": "Spanish",
|
@@ -37,11 +69,11 @@
|
|
37 |
"speakers": 485000000.0,
|
38 |
"scores": [
|
39 |
{
|
40 |
-
"model": "
|
41 |
-
"bleu": 0.
|
42 |
}
|
43 |
],
|
44 |
-
"bleu": 0.
|
45 |
},
|
46 |
{
|
47 |
"language_name": "Hindi",
|
@@ -49,30 +81,262 @@
|
|
49 |
"speakers": 341000000.0,
|
50 |
"scores": [
|
51 |
{
|
52 |
-
"model": "
|
53 |
-
"bleu": 0.
|
54 |
}
|
55 |
],
|
56 |
-
"bleu": 0.
|
57 |
},
|
58 |
{
|
59 |
"language_name": "Bengali",
|
60 |
"language_code": "ben",
|
61 |
"speakers": 300000000.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
62 |
"scores": [
|
63 |
{
|
64 |
"model": "openai/gpt-4o-mini",
|
65 |
-
"bleu": 0.
|
|
|
|
|
|
|
|
|
66 |
},
|
67 |
{
|
68 |
-
"model": "
|
69 |
-
"bleu": 0.
|
70 |
},
|
71 |
{
|
72 |
-
"model": "google/gemini-flash-
|
73 |
-
"bleu": 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
}
|
75 |
],
|
76 |
-
"bleu": 0.
|
77 |
}
|
78 |
]
|
|
|
5 |
"speakers": 1132366680.0,
|
6 |
"scores": [
|
7 |
{
|
8 |
+
"model": "openai/gpt-4o-mini",
|
9 |
+
"bleu": 0.5103385437635193
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
13 |
+
"bleu": 0.4845283039311465
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
17 |
+
"bleu": 0.4735424836788773
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"model": "google/gemini-2.0-flash-001",
|
21 |
+
"bleu": 0.5639490578152662
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"model": "deepseek/deepseek-chat",
|
25 |
+
"bleu": 0.5547524505965893
|
26 |
+
},
|
27 |
+
{
|
28 |
+
"model": "microsoft/phi-4",
|
29 |
+
"bleu": 0.48008677312779885
|
30 |
}
|
31 |
],
|
32 |
+
"bleu": 0.5111996021521995
|
33 |
},
|
34 |
{
|
35 |
"language_name": "Mandarin Chinese",
|
|
|
38 |
"scores": [
|
39 |
{
|
40 |
"model": "openai/gpt-4o-mini",
|
41 |
+
"bleu": 0.38427885971806375
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
45 |
+
"bleu": 0.4309762560114817
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
49 |
+
"bleu": 0.40933363203497697
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"model": "google/gemini-2.0-flash-001",
|
53 |
+
"bleu": 0.4486368724887284
|
54 |
},
|
55 |
{
|
56 |
+
"model": "deepseek/deepseek-chat",
|
57 |
+
"bleu": 0.4354691779014211
|
58 |
},
|
59 |
{
|
60 |
+
"model": "microsoft/phi-4",
|
61 |
+
"bleu": 0.3597312915524714
|
62 |
}
|
63 |
],
|
64 |
+
"bleu": 0.41140434828452394
|
65 |
},
|
66 |
{
|
67 |
"language_name": "Spanish",
|
|
|
69 |
"speakers": 485000000.0,
|
70 |
"scores": [
|
71 |
{
|
72 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
73 |
+
"bleu": 0.41303609006378467
|
74 |
}
|
75 |
],
|
76 |
+
"bleu": 0.41303609006378467
|
77 |
},
|
78 |
{
|
79 |
"language_name": "Hindi",
|
|
|
81 |
"speakers": 341000000.0,
|
82 |
"scores": [
|
83 |
{
|
84 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
85 |
+
"bleu": 0.39051313583666847
|
86 |
}
|
87 |
],
|
88 |
+
"bleu": 0.39051313583666847
|
89 |
},
|
90 |
{
|
91 |
"language_name": "Bengali",
|
92 |
"language_code": "ben",
|
93 |
"speakers": 300000000.0,
|
94 |
+
"scores": [
|
95 |
+
{
|
96 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
97 |
+
"bleu": 0.3922760582029
|
98 |
+
}
|
99 |
+
],
|
100 |
+
"bleu": 0.3922760582029
|
101 |
+
},
|
102 |
+
{
|
103 |
+
"language_name": "Portuguese",
|
104 |
+
"language_code": "por",
|
105 |
+
"speakers": 254300000.0,
|
106 |
+
"scores": [
|
107 |
+
{
|
108 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
109 |
+
"bleu": 0.3569933404494365
|
110 |
+
}
|
111 |
+
],
|
112 |
+
"bleu": 0.3569933404494365
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"language_name": "French",
|
116 |
+
"language_code": "fra",
|
117 |
+
"speakers": 208157220.0,
|
118 |
+
"scores": [
|
119 |
+
{
|
120 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
121 |
+
"bleu": 0.4092873981445945
|
122 |
+
}
|
123 |
+
],
|
124 |
+
"bleu": 0.4092873981445945
|
125 |
+
},
|
126 |
+
{
|
127 |
+
"language_name": "Indonesian",
|
128 |
+
"language_code": "ind",
|
129 |
+
"speakers": 198996550.0,
|
130 |
+
"scores": [
|
131 |
+
{
|
132 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
133 |
+
"bleu": 0.3671689105193036
|
134 |
+
}
|
135 |
+
],
|
136 |
+
"bleu": 0.3671689105193036
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"language_name": "Russian",
|
140 |
+
"language_code": "rus",
|
141 |
+
"speakers": 171428900.0,
|
142 |
"scores": [
|
143 |
{
|
144 |
"model": "openai/gpt-4o-mini",
|
145 |
+
"bleu": 0.3821837153890323
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
149 |
+
"bleu": 0.3974431757931015
|
150 |
},
|
151 |
{
|
152 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
153 |
+
"bleu": 0.2541840010941474
|
154 |
},
|
155 |
{
|
156 |
+
"model": "google/gemini-2.0-flash-001",
|
157 |
+
"bleu": 0.43388586741780116
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"model": "deepseek/deepseek-chat",
|
161 |
+
"bleu": 0.4148930468752925
|
162 |
+
},
|
163 |
+
{
|
164 |
+
"model": "microsoft/phi-4",
|
165 |
+
"bleu": 0.3530948239011605
|
166 |
+
}
|
167 |
+
],
|
168 |
+
"bleu": 0.3726141050784226
|
169 |
+
},
|
170 |
+
{
|
171 |
+
"language_name": "Japanese",
|
172 |
+
"language_code": "jpn",
|
173 |
+
"speakers": 128000000.0,
|
174 |
+
"scores": [
|
175 |
+
{
|
176 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
177 |
+
"bleu": 0.294012705268792
|
178 |
+
}
|
179 |
+
],
|
180 |
+
"bleu": 0.294012705268792
|
181 |
+
},
|
182 |
+
{
|
183 |
+
"language_name": "Eastern Punjabi",
|
184 |
+
"language_code": "pan",
|
185 |
+
"speakers": 125000000.0,
|
186 |
+
"scores": [
|
187 |
+
{
|
188 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
189 |
+
"bleu": 0.37715805829458243
|
190 |
+
}
|
191 |
+
],
|
192 |
+
"bleu": 0.37715805829458243
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"language_name": "Standard German",
|
196 |
+
"language_code": "deu",
|
197 |
+
"speakers": 105000000.0,
|
198 |
+
"scores": [
|
199 |
+
{
|
200 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
201 |
+
"bleu": 0.39190456406769925
|
202 |
+
}
|
203 |
+
],
|
204 |
+
"bleu": 0.39190456406769925
|
205 |
+
},
|
206 |
+
{
|
207 |
+
"language_name": "Egyptian Arabic",
|
208 |
+
"language_code": "arz",
|
209 |
+
"speakers": 100542400.0,
|
210 |
+
"scores": [
|
211 |
+
{
|
212 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
213 |
+
"bleu": 0.2769739921069721
|
214 |
+
}
|
215 |
+
],
|
216 |
+
"bleu": 0.2769739921069721
|
217 |
+
},
|
218 |
+
{
|
219 |
+
"language_name": "Urdu",
|
220 |
+
"language_code": "urd",
|
221 |
+
"speakers": 94022900.0,
|
222 |
+
"scores": [
|
223 |
+
{
|
224 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
225 |
+
"bleu": 0.30532627541695706
|
226 |
+
}
|
227 |
+
],
|
228 |
+
"bleu": 0.30532627541695706
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"language_name": "Filipino",
|
232 |
+
"language_code": "fil",
|
233 |
+
"speakers": 90000000.0,
|
234 |
+
"scores": [
|
235 |
+
{
|
236 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
237 |
+
"bleu": 0.38380780370055084
|
238 |
+
}
|
239 |
+
],
|
240 |
+
"bleu": 0.38380780370055084
|
241 |
+
},
|
242 |
+
{
|
243 |
+
"language_name": "Javanese",
|
244 |
+
"language_code": "jav",
|
245 |
+
"speakers": 84308740.0,
|
246 |
+
"scores": [
|
247 |
+
{
|
248 |
+
"model": "openai/gpt-4o-mini",
|
249 |
+
"bleu": 0.303156768433342
|
250 |
+
},
|
251 |
+
{
|
252 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
253 |
+
"bleu": 0.3147001751424492
|
254 |
+
},
|
255 |
+
{
|
256 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
257 |
+
"bleu": 0.1507764424388819
|
258 |
+
},
|
259 |
+
{
|
260 |
+
"model": "google/gemini-2.0-flash-001",
|
261 |
+
"bleu": 0.41409824694226155
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"model": "deepseek/deepseek-chat",
|
265 |
+
"bleu": 0.3240536705195471
|
266 |
+
},
|
267 |
+
{
|
268 |
+
"model": "microsoft/phi-4",
|
269 |
+
"bleu": 0.22770614610795217
|
270 |
+
}
|
271 |
+
],
|
272 |
+
"bleu": 0.2890819082640723
|
273 |
+
},
|
274 |
+
{
|
275 |
+
"language_name": "Marathi",
|
276 |
+
"language_code": "mar",
|
277 |
+
"speakers": 83100000.0,
|
278 |
+
"scores": [
|
279 |
+
{
|
280 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
281 |
+
"bleu": 0.3754377211201414
|
282 |
+
}
|
283 |
+
],
|
284 |
+
"bleu": 0.3754377211201414
|
285 |
+
},
|
286 |
+
{
|
287 |
+
"language_name": "Swahili",
|
288 |
+
"language_code": "swh",
|
289 |
+
"speakers": 82300000.0,
|
290 |
+
"scores": [
|
291 |
+
{
|
292 |
+
"model": "openai/gpt-4o-mini",
|
293 |
+
"bleu": 0.3698648558947496
|
294 |
+
},
|
295 |
+
{
|
296 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
297 |
+
"bleu": 0.31914577240036923
|
298 |
+
},
|
299 |
+
{
|
300 |
+
"model": "mistralai/mistral-small-24b-instruct-2501",
|
301 |
+
"bleu": 0.16066681130875948
|
302 |
+
},
|
303 |
+
{
|
304 |
+
"model": "google/gemini-2.0-flash-001",
|
305 |
+
"bleu": 0.3934769032884265
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"model": "deepseek/deepseek-chat",
|
309 |
+
"bleu": 0.3605623890073268
|
310 |
+
},
|
311 |
+
{
|
312 |
+
"model": "microsoft/phi-4",
|
313 |
+
"bleu": 0.175030478984087
|
314 |
+
}
|
315 |
+
],
|
316 |
+
"bleu": 0.2964578684806198
|
317 |
+
},
|
318 |
+
{
|
319 |
+
"language_name": "Turkish",
|
320 |
+
"language_code": "tur",
|
321 |
+
"speakers": 82231620.0,
|
322 |
+
"scores": [
|
323 |
+
{
|
324 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
325 |
+
"bleu": 0.37080958221553817
|
326 |
+
}
|
327 |
+
],
|
328 |
+
"bleu": 0.37080958221553817
|
329 |
+
},
|
330 |
+
{
|
331 |
+
"language_name": "Telugu",
|
332 |
+
"language_code": "tel",
|
333 |
+
"speakers": 82000000.0,
|
334 |
+
"scores": [
|
335 |
+
{
|
336 |
+
"model": "meta-llama/llama-3.3-70b-instruct",
|
337 |
+
"bleu": 0.35400532981470717
|
338 |
}
|
339 |
],
|
340 |
+
"bleu": 0.35400532981470717
|
341 |
}
|
342 |
]
|