David Pomerenke committed on
Commit
175993f
·
1 Parent(s): 63202a2

Newer models, run on 20 languages

Browse files
Files changed (2) hide show
  1. evals.py +11 -9
  2. results.json +285 -21
evals.py CHANGED
@@ -15,14 +15,16 @@ from transformers import NllbTokenizer
15
 
16
  # config
17
  models = [
18
- "openai/gpt-4o-mini",
19
- "anthropic/claude-3.5-haiku",
20
- # "meta-llama/llama-3.1-405b-instruct", # lots of slow repetitions for LRLs
21
- # "mistralai/mistral-large",
22
- "google/gemini-flash-1.5", # very fast
23
- # "qwen/qwen-2.5-72b-instruct", # somewhat slow
 
 
24
  ]
25
- fast_model = "anthropic/claude-3.5-haiku"
26
  n_sentences = 30
27
 
28
  # setup
@@ -93,7 +95,7 @@ languages = pd.merge(benchmark_languages, languages, on="language_code", how="ou
93
  languages = pd.merge(languages, script_names, on="script_code", how="left")
94
  languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
95
  languages = languages.sort_values(by="speakers", ascending=False)
96
- languages = languages.iloc[:5]
97
 
98
  # sample languages to translate to
99
  target_languages_NEW = languages[languages["in_benchmark"]].sample(
@@ -101,7 +103,7 @@ target_languages_NEW = languages[languages["in_benchmark"]].sample(
101
  )
102
  # sample languages to analyze with all models
103
  detailed_languages = languages[languages["in_benchmark"]].sample(
104
- n=2, random_state=42
105
  )
106
 
107
 
 
15
 
16
  # config
17
  models = [
18
+ "openai/gpt-4o-mini", # 0.6$/M tokens
19
+ # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
20
+ "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
21
+ "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
22
+ "google/gemini-2.0-flash-001", # 0.4$/M tokens
23
+ # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
24
+ "deepseek/deepseek-chat", # 0.9$/M tokens
25
+ "microsoft/phi-4", # 0.07$/M tokens
26
  ]
27
+ fast_model = "meta-llama/llama-3.3-70b-instruct"
28
  n_sentences = 30
29
 
30
  # setup
 
95
  languages = pd.merge(languages, script_names, on="script_code", how="left")
96
  languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
97
  languages = languages.sort_values(by="speakers", ascending=False)
98
+ languages = languages.iloc[:20]
99
 
100
  # sample languages to translate to
101
  target_languages_NEW = languages[languages["in_benchmark"]].sample(
 
103
  )
104
  # sample languages to analyze with all models
105
  detailed_languages = languages[languages["in_benchmark"]].sample(
106
+ n=5, random_state=42
107
  )
108
 
109
 
results.json CHANGED
@@ -5,11 +5,31 @@
5
  "speakers": 1132366680.0,
6
  "scores": [
7
  {
8
- "model": "anthropic/claude-3.5-haiku",
9
- "bleu": 0.5911781645744415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  }
11
  ],
12
- "bleu": 0.5911781645744415
13
  },
14
  {
15
  "language_name": "Mandarin Chinese",
@@ -18,18 +38,30 @@
18
  "scores": [
19
  {
20
  "model": "openai/gpt-4o-mini",
21
- "bleu": 0.5423652619241204
 
 
 
 
 
 
 
 
 
 
 
 
22
  },
23
  {
24
- "model": "anthropic/claude-3.5-haiku",
25
- "bleu": 0.4734856962747124
26
  },
27
  {
28
- "model": "google/gemini-flash-1.5",
29
- "bleu": 0.430570062499388
30
  }
31
  ],
32
- "bleu": 0.4821403402327402
33
  },
34
  {
35
  "language_name": "Spanish",
@@ -37,11 +69,11 @@
37
  "speakers": 485000000.0,
38
  "scores": [
39
  {
40
- "model": "anthropic/claude-3.5-haiku",
41
- "bleu": 0.4131404308980914
42
  }
43
  ],
44
- "bleu": 0.4131404308980914
45
  },
46
  {
47
  "language_name": "Hindi",
@@ -49,30 +81,262 @@
49
  "speakers": 341000000.0,
50
  "scores": [
51
  {
52
- "model": "anthropic/claude-3.5-haiku",
53
- "bleu": 0.3710125447937959
54
  }
55
  ],
56
- "bleu": 0.3710125447937959
57
  },
58
  {
59
  "language_name": "Bengali",
60
  "language_code": "ben",
61
  "speakers": 300000000.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  "scores": [
63
  {
64
  "model": "openai/gpt-4o-mini",
65
- "bleu": 0.40080430939726097
 
 
 
 
66
  },
67
  {
68
- "model": "anthropic/claude-3.5-haiku",
69
- "bleu": 0.3733558186182232
70
  },
71
  {
72
- "model": "google/gemini-flash-1.5",
73
- "bleu": 0.4337794805645439
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  }
75
  ],
76
- "bleu": 0.4026465361933427
77
  }
78
  ]
 
5
  "speakers": 1132366680.0,
6
  "scores": [
7
  {
8
+ "model": "openai/gpt-4o-mini",
9
+ "bleu": 0.5103385437635193
10
+ },
11
+ {
12
+ "model": "meta-llama/llama-3.3-70b-instruct",
13
+ "bleu": 0.4845283039311465
14
+ },
15
+ {
16
+ "model": "mistralai/mistral-small-24b-instruct-2501",
17
+ "bleu": 0.4735424836788773
18
+ },
19
+ {
20
+ "model": "google/gemini-2.0-flash-001",
21
+ "bleu": 0.5639490578152662
22
+ },
23
+ {
24
+ "model": "deepseek/deepseek-chat",
25
+ "bleu": 0.5547524505965893
26
+ },
27
+ {
28
+ "model": "microsoft/phi-4",
29
+ "bleu": 0.48008677312779885
30
  }
31
  ],
32
+ "bleu": 0.5111996021521995
33
  },
34
  {
35
  "language_name": "Mandarin Chinese",
 
38
  "scores": [
39
  {
40
  "model": "openai/gpt-4o-mini",
41
+ "bleu": 0.38427885971806375
42
+ },
43
+ {
44
+ "model": "meta-llama/llama-3.3-70b-instruct",
45
+ "bleu": 0.4309762560114817
46
+ },
47
+ {
48
+ "model": "mistralai/mistral-small-24b-instruct-2501",
49
+ "bleu": 0.40933363203497697
50
+ },
51
+ {
52
+ "model": "google/gemini-2.0-flash-001",
53
+ "bleu": 0.4486368724887284
54
  },
55
  {
56
+ "model": "deepseek/deepseek-chat",
57
+ "bleu": 0.4354691779014211
58
  },
59
  {
60
+ "model": "microsoft/phi-4",
61
+ "bleu": 0.3597312915524714
62
  }
63
  ],
64
+ "bleu": 0.41140434828452394
65
  },
66
  {
67
  "language_name": "Spanish",
 
69
  "speakers": 485000000.0,
70
  "scores": [
71
  {
72
+ "model": "meta-llama/llama-3.3-70b-instruct",
73
+ "bleu": 0.41303609006378467
74
  }
75
  ],
76
+ "bleu": 0.41303609006378467
77
  },
78
  {
79
  "language_name": "Hindi",
 
81
  "speakers": 341000000.0,
82
  "scores": [
83
  {
84
+ "model": "meta-llama/llama-3.3-70b-instruct",
85
+ "bleu": 0.39051313583666847
86
  }
87
  ],
88
+ "bleu": 0.39051313583666847
89
  },
90
  {
91
  "language_name": "Bengali",
92
  "language_code": "ben",
93
  "speakers": 300000000.0,
94
+ "scores": [
95
+ {
96
+ "model": "meta-llama/llama-3.3-70b-instruct",
97
+ "bleu": 0.3922760582029
98
+ }
99
+ ],
100
+ "bleu": 0.3922760582029
101
+ },
102
+ {
103
+ "language_name": "Portuguese",
104
+ "language_code": "por",
105
+ "speakers": 254300000.0,
106
+ "scores": [
107
+ {
108
+ "model": "meta-llama/llama-3.3-70b-instruct",
109
+ "bleu": 0.3569933404494365
110
+ }
111
+ ],
112
+ "bleu": 0.3569933404494365
113
+ },
114
+ {
115
+ "language_name": "French",
116
+ "language_code": "fra",
117
+ "speakers": 208157220.0,
118
+ "scores": [
119
+ {
120
+ "model": "meta-llama/llama-3.3-70b-instruct",
121
+ "bleu": 0.4092873981445945
122
+ }
123
+ ],
124
+ "bleu": 0.4092873981445945
125
+ },
126
+ {
127
+ "language_name": "Indonesian",
128
+ "language_code": "ind",
129
+ "speakers": 198996550.0,
130
+ "scores": [
131
+ {
132
+ "model": "meta-llama/llama-3.3-70b-instruct",
133
+ "bleu": 0.3671689105193036
134
+ }
135
+ ],
136
+ "bleu": 0.3671689105193036
137
+ },
138
+ {
139
+ "language_name": "Russian",
140
+ "language_code": "rus",
141
+ "speakers": 171428900.0,
142
  "scores": [
143
  {
144
  "model": "openai/gpt-4o-mini",
145
+ "bleu": 0.3821837153890323
146
+ },
147
+ {
148
+ "model": "meta-llama/llama-3.3-70b-instruct",
149
+ "bleu": 0.3974431757931015
150
  },
151
  {
152
+ "model": "mistralai/mistral-small-24b-instruct-2501",
153
+ "bleu": 0.2541840010941474
154
  },
155
  {
156
+ "model": "google/gemini-2.0-flash-001",
157
+ "bleu": 0.43388586741780116
158
+ },
159
+ {
160
+ "model": "deepseek/deepseek-chat",
161
+ "bleu": 0.4148930468752925
162
+ },
163
+ {
164
+ "model": "microsoft/phi-4",
165
+ "bleu": 0.3530948239011605
166
+ }
167
+ ],
168
+ "bleu": 0.3726141050784226
169
+ },
170
+ {
171
+ "language_name": "Japanese",
172
+ "language_code": "jpn",
173
+ "speakers": 128000000.0,
174
+ "scores": [
175
+ {
176
+ "model": "meta-llama/llama-3.3-70b-instruct",
177
+ "bleu": 0.294012705268792
178
+ }
179
+ ],
180
+ "bleu": 0.294012705268792
181
+ },
182
+ {
183
+ "language_name": "Eastern Punjabi",
184
+ "language_code": "pan",
185
+ "speakers": 125000000.0,
186
+ "scores": [
187
+ {
188
+ "model": "meta-llama/llama-3.3-70b-instruct",
189
+ "bleu": 0.37715805829458243
190
+ }
191
+ ],
192
+ "bleu": 0.37715805829458243
193
+ },
194
+ {
195
+ "language_name": "Standard German",
196
+ "language_code": "deu",
197
+ "speakers": 105000000.0,
198
+ "scores": [
199
+ {
200
+ "model": "meta-llama/llama-3.3-70b-instruct",
201
+ "bleu": 0.39190456406769925
202
+ }
203
+ ],
204
+ "bleu": 0.39190456406769925
205
+ },
206
+ {
207
+ "language_name": "Egyptian Arabic",
208
+ "language_code": "arz",
209
+ "speakers": 100542400.0,
210
+ "scores": [
211
+ {
212
+ "model": "meta-llama/llama-3.3-70b-instruct",
213
+ "bleu": 0.2769739921069721
214
+ }
215
+ ],
216
+ "bleu": 0.2769739921069721
217
+ },
218
+ {
219
+ "language_name": "Urdu",
220
+ "language_code": "urd",
221
+ "speakers": 94022900.0,
222
+ "scores": [
223
+ {
224
+ "model": "meta-llama/llama-3.3-70b-instruct",
225
+ "bleu": 0.30532627541695706
226
+ }
227
+ ],
228
+ "bleu": 0.30532627541695706
229
+ },
230
+ {
231
+ "language_name": "Filipino",
232
+ "language_code": "fil",
233
+ "speakers": 90000000.0,
234
+ "scores": [
235
+ {
236
+ "model": "meta-llama/llama-3.3-70b-instruct",
237
+ "bleu": 0.38380780370055084
238
+ }
239
+ ],
240
+ "bleu": 0.38380780370055084
241
+ },
242
+ {
243
+ "language_name": "Javanese",
244
+ "language_code": "jav",
245
+ "speakers": 84308740.0,
246
+ "scores": [
247
+ {
248
+ "model": "openai/gpt-4o-mini",
249
+ "bleu": 0.303156768433342
250
+ },
251
+ {
252
+ "model": "meta-llama/llama-3.3-70b-instruct",
253
+ "bleu": 0.3147001751424492
254
+ },
255
+ {
256
+ "model": "mistralai/mistral-small-24b-instruct-2501",
257
+ "bleu": 0.1507764424388819
258
+ },
259
+ {
260
+ "model": "google/gemini-2.0-flash-001",
261
+ "bleu": 0.41409824694226155
262
+ },
263
+ {
264
+ "model": "deepseek/deepseek-chat",
265
+ "bleu": 0.3240536705195471
266
+ },
267
+ {
268
+ "model": "microsoft/phi-4",
269
+ "bleu": 0.22770614610795217
270
+ }
271
+ ],
272
+ "bleu": 0.2890819082640723
273
+ },
274
+ {
275
+ "language_name": "Marathi",
276
+ "language_code": "mar",
277
+ "speakers": 83100000.0,
278
+ "scores": [
279
+ {
280
+ "model": "meta-llama/llama-3.3-70b-instruct",
281
+ "bleu": 0.3754377211201414
282
+ }
283
+ ],
284
+ "bleu": 0.3754377211201414
285
+ },
286
+ {
287
+ "language_name": "Swahili",
288
+ "language_code": "swh",
289
+ "speakers": 82300000.0,
290
+ "scores": [
291
+ {
292
+ "model": "openai/gpt-4o-mini",
293
+ "bleu": 0.3698648558947496
294
+ },
295
+ {
296
+ "model": "meta-llama/llama-3.3-70b-instruct",
297
+ "bleu": 0.31914577240036923
298
+ },
299
+ {
300
+ "model": "mistralai/mistral-small-24b-instruct-2501",
301
+ "bleu": 0.16066681130875948
302
+ },
303
+ {
304
+ "model": "google/gemini-2.0-flash-001",
305
+ "bleu": 0.3934769032884265
306
+ },
307
+ {
308
+ "model": "deepseek/deepseek-chat",
309
+ "bleu": 0.3605623890073268
310
+ },
311
+ {
312
+ "model": "microsoft/phi-4",
313
+ "bleu": 0.175030478984087
314
+ }
315
+ ],
316
+ "bleu": 0.2964578684806198
317
+ },
318
+ {
319
+ "language_name": "Turkish",
320
+ "language_code": "tur",
321
+ "speakers": 82231620.0,
322
+ "scores": [
323
+ {
324
+ "model": "meta-llama/llama-3.3-70b-instruct",
325
+ "bleu": 0.37080958221553817
326
+ }
327
+ ],
328
+ "bleu": 0.37080958221553817
329
+ },
330
+ {
331
+ "language_name": "Telugu",
332
+ "language_code": "tel",
333
+ "speakers": 82000000.0,
334
+ "scores": [
335
+ {
336
+ "model": "meta-llama/llama-3.3-70b-instruct",
337
+ "bleu": 0.35400532981470717
338
  }
339
  ],
340
+ "bleu": 0.35400532981470717
341
  }
342
  ]