David Pomerenke committed on
Commit
a65282b
·
1 Parent(s): d597fe1

Nice tables and plots

Browse files
Files changed (3) hide show
  1. app.py +124 -41
  2. evals.py +3 -3
  3. results.json +302 -92
app.py CHANGED
@@ -2,12 +2,12 @@ import gradio as gr
2
  import json
3
  import pandas as pd
4
  import plotly.graph_objects as go
5
- from plotly.subplots import make_subplots
6
 
7
  # Load and process results
8
  with open("results.json") as f:
9
  results = json.load(f)
10
 
 
11
  def create_model_comparison_plot(results):
12
  # Extract all unique models
13
  models = set()
@@ -15,96 +15,179 @@ def create_model_comparison_plot(results):
15
  for score in lang["scores"]:
16
  models.add(score["model"])
17
  models = list(models)
18
-
19
  # Create traces for each model
20
  traces = []
21
  for model in models:
22
  x_vals = [] # languages
23
  y_vals = [] # BLEU scores
24
-
25
  for lang in results:
26
- model_score = next((s["bleu"] for s in lang["scores"] if s["model"] == model), None)
 
 
27
  if model_score is not None:
28
  x_vals.append(lang["language_name"])
29
  y_vals.append(model_score)
30
-
31
- traces.append(go.Bar(
32
- name=model.split('/')[-1],
33
- x=x_vals,
34
- y=y_vals,
35
- ))
36
-
 
 
37
  fig = go.Figure(data=traces)
38
  fig.update_layout(
39
  title="BLEU Scores by Model and Language",
40
  xaxis_title="Language",
41
  yaxis_title="BLEU Score",
42
- barmode='group',
43
- height=500
44
  )
45
  return fig
46
 
 
47
  def create_scatter_plot(results):
48
  fig = go.Figure()
49
-
50
  x_vals = [lang["speakers"] / 1_000_000 for lang in results] # Convert to millions
51
  y_vals = [lang["bleu"] for lang in results]
52
  labels = [lang["language_name"] for lang in results]
53
-
54
- fig.add_trace(go.Scatter(
55
- x=x_vals,
56
- y=y_vals,
57
- mode='markers+text',
58
- text=labels,
59
- textposition="top center",
60
- hovertemplate="<b>%{text}</b><br>" +
61
- "Speakers: %{x:.1f}M<br>" +
62
- "BLEU Score: %{y:.3f}<extra></extra>"
63
- ))
64
-
 
 
65
  fig.update_layout(
66
  title="Language Coverage: Speakers vs BLEU Score",
67
  xaxis_title="Number of Speakers (Millions)",
68
  yaxis_title="Average BLEU Score",
69
  height=500,
70
- showlegend=False
71
  )
72
-
73
  # Use log scale for x-axis since speaker numbers vary widely
74
  fig.update_xaxes(type="log")
75
-
76
  return fig
77
 
 
78
  def create_results_df(results):
79
  # Create a list to store flattened data
80
  flat_data = []
81
-
82
  for lang in results:
 
 
 
83
  row = {
84
  "Language": lang["language_name"],
85
  "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
86
- "Average BLEU": round(lang["bleu"], 3),
 
 
 
87
  }
88
- # Add individual model scores
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  for score in lang["scores"]:
90
- model_name = score["model"].split('/')[-1]
91
- row[f"{model_name} BLEU"] = round(score["bleu"], 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- flat_data.append(row)
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- return pd.DataFrame(flat_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Create the visualization components
98
  with gr.Blocks(title="AI Language Translation Benchmark") as demo:
99
  gr.Markdown("# AI Language Translation Benchmark")
100
- gr.Markdown("Comparing translation performance across different AI models and languages")
101
-
 
 
102
  df = create_results_df(results)
 
103
  bar_plot = create_model_comparison_plot(results)
104
  scatter_plot = create_scatter_plot(results)
105
-
106
- gr.DataFrame(value=df, label="Translation Results", show_search="search")
107
  gr.Plot(value=bar_plot, label="Model Comparison")
 
108
  gr.Plot(value=scatter_plot, label="Language Coverage")
109
 
110
- demo.launch()
 
2
  import json
3
  import pandas as pd
4
  import plotly.graph_objects as go
 
5
 
6
  # Load and process results
7
  with open("results.json") as f:
8
  results = json.load(f)
9
 
10
+
11
  def create_model_comparison_plot(results):
12
  # Extract all unique models
13
  models = set()
 
15
  for score in lang["scores"]:
16
  models.add(score["model"])
17
  models = list(models)
18
+
19
  # Create traces for each model
20
  traces = []
21
  for model in models:
22
  x_vals = [] # languages
23
  y_vals = [] # BLEU scores
24
+
25
  for lang in results:
26
+ model_score = next(
27
+ (s["bleu"] for s in lang["scores"] if s["model"] == model), None
28
+ )
29
  if model_score is not None:
30
  x_vals.append(lang["language_name"])
31
  y_vals.append(model_score)
32
+
33
+ traces.append(
34
+ go.Bar(
35
+ name=model.split("/")[-1],
36
+ x=x_vals,
37
+ y=y_vals,
38
+ )
39
+ )
40
+
41
  fig = go.Figure(data=traces)
42
  fig.update_layout(
43
  title="BLEU Scores by Model and Language",
44
  xaxis_title="Language",
45
  yaxis_title="BLEU Score",
46
+ barmode="group",
47
+ height=500,
48
  )
49
  return fig
50
 
51
+
def create_scatter_plot(results):
    """Build a scatter plot of speaker count against average BLEU per language.

    Args:
        results: Iterable of per-language dicts providing the keys
            ``"language_name"``, ``"speakers"`` (absolute speaker count) and
            ``"bleu"`` (average BLEU, or ``None`` when nothing was scored).

    Returns:
        A ``plotly.graph_objects.Figure`` containing one labelled marker per
        plottable language, with a logarithmic x-axis.
    """
    # A language without an average BLEU has no y-coordinate, and a speaker
    # count of zero has no position on a logarithmic axis, so neither can be
    # drawn. Drop them up front instead of handing unplottable points to Plotly.
    plottable = [
        lang
        for lang in results
        if lang["bleu"] is not None and (lang["speakers"] or 0) > 0
    ]

    x_vals = [lang["speakers"] / 1_000_000 for lang in plottable]  # millions
    y_vals = [lang["bleu"] for lang in plottable]
    labels = [lang["language_name"] for lang in plottable]

    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate="<b>%{text}</b><br>"
            + "Speakers: %{x:.1f}M<br>"
            + "BLEU Score: %{y:.3f}<extra></extra>",
        )
    )

    fig.update_layout(
        title="Language Coverage: Speakers vs BLEU Score",
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title="Average BLEU Score",
        height=500,
        showlegend=False,
    )

    # Use log scale for x-axis since speaker numbers vary widely
    fig.update_xaxes(type="log")

    return fig
84
 
85
+
def create_results_df(results):
    """Flatten the per-language results into one summary row per language.

    Args:
        results: Iterable of per-language dicts providing the keys
            ``"language_name"``, ``"speakers"``, ``"bleu"`` (average BLEU or
            ``None``) and ``"scores"`` (a list of ``{"model", "bleu"}`` dicts,
            possibly empty).

    Returns:
        A ``pandas.DataFrame`` with the columns ``Language``,
        ``Speakers (M)``, ``Models Tested``, ``Average BLEU``, ``Best Model``
        and ``Best Model BLEU``. Values that cannot be computed are shown as
        the string ``"N/A"``.
    """
    rows = []

    for lang in results:
        # Only entries that actually carry a BLEU value can compete for
        # "best model"; comparing None against a float would raise TypeError.
        scored = [s for s in lang["scores"] if s["bleu"] is not None]
        best = max(scored, key=lambda s: s["bleu"]) if scored else None

        average = lang["bleu"]
        has_best_model = best is not None and best["model"] is not None

        rows.append(
            {
                "Language": lang["language_name"],
                "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
                "Models Tested": len(lang["scores"]),
                "Average BLEU": round(average, 3) if average is not None else "N/A",
                "Best Model": best["model"] if has_best_model else "N/A",
                "Best Model BLEU": round(best["bleu"], 3) if best is not None else "N/A",
            }
        )

    return pd.DataFrame(rows)
105
+
106
+
def _rounded_mean(values, default=0):
    """Return the mean of *values* rounded to 3 decimals, or *default* if empty."""
    return round(sum(values) / len(values), 3) if values else default


def create_leaderboard_df(results):
    """Rank models by their mean BLEU, overall and per language category.

    Languages that have an average BLEU are sorted by it; the top quarter is
    labelled "High-Resource", the bottom quarter "Low-Resource" and everything
    else (including languages without an average) "Mid-Resource". Note that
    the categories are therefore derived from BLEU, not from speaker counts.

    Args:
        results: Iterable of per-language dicts providing the keys
            ``"language_name"``, ``"bleu"`` (average BLEU or ``None``) and
            ``"scores"`` (a list of ``{"model", "bleu"}`` dicts).

    Returns:
        A ``pandas.DataFrame`` sorted by ``Overall BLEU`` (descending) with
        the columns ``Rank``, ``Model``, ``Overall BLEU``,
        ``High-Resource BLEU``, ``Mid-Resource BLEU``, ``Low-Resource BLEU``
        and ``Languages Tested``. A category in which a model has no scores
        is reported as 0. The frame is empty (but has these columns) when no
        model has any score.
    """
    categories = ("High-Resource", "Mid-Resource", "Low-Resource")
    columns = [
        "Rank",
        "Model",
        "Overall BLEU",
        "High-Resource BLEU",
        "Mid-Resource BLEU",
        "Low-Resource BLEU",
        "Languages Tested",
    ]

    # Sort languages by average BLEU to determine resource categories
    ranked = sorted(
        (lang for lang in results if lang["bleu"] is not None),
        key=lambda lang: lang["bleu"],
        reverse=True,
    )
    quartile = len(ranked) // 4
    high_resource = {lang["language_name"] for lang in ranked[:quartile]}
    low_resource = {
        lang["language_name"] for lang in ranked[len(ranked) - quartile:]
    }

    # Collect every model's scores, bucketed by the category of the language
    model_scores = {}
    for lang in results:
        name = lang["language_name"]
        if name in high_resource:
            category = "High-Resource"
        elif name in low_resource:
            category = "Low-Resource"
        else:
            category = "Mid-Resource"

        for score in lang["scores"]:
            if score["bleu"] is None:
                # A missing score cannot be averaged; skip it rather than crash.
                continue
            model_name = score["model"].split("/")[-1]
            buckets = model_scores.setdefault(
                model_name, {label: [] for label in categories}
            )
            buckets[category].append(score["bleu"])

    leaderboard_data = []
    for model, buckets in model_scores.items():
        all_scores = [value for label in categories for value in buckets[label]]
        leaderboard_data.append(
            {
                "Model": model,
                "Overall BLEU": _rounded_mean(all_scores),
                "High-Resource BLEU": _rounded_mean(buckets["High-Resource"]),
                "Mid-Resource BLEU": _rounded_mean(buckets["Mid-Resource"]),
                "Low-Resource BLEU": _rounded_mean(buckets["Low-Resource"]),
                "Languages Tested": len(all_scores),
            }
        )

    if not leaderboard_data:
        # Without rows the frame would have no "Overall BLEU" column and
        # sort_values would raise KeyError.
        return pd.DataFrame(columns=columns)

    # Sort by overall BLEU; a stable sort keeps tied models in a
    # deterministic (first-seen) order.
    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall BLEU", ascending=False, kind="stable")

    # Add rank, with medals for the top three
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    df["Rank"] = [
        medals.get(position, str(position)) for position in range(1, len(df) + 1)
    ]

    # Reorder columns
    return df[columns]
174
+
175
 
# Create the visualization components.
# Layout, top to bottom: model leaderboard table, per-model bar chart,
# per-language results table, speakers-vs-BLEU scatter plot.
with gr.Blocks(title="AI Language Translation Benchmark") as demo:
    gr.Markdown("# AI Language Translation Benchmark")
    gr.Markdown(
        "Comparing translation performance across different AI models and languages"
    )

    df = create_results_df(results)
    leaderboard_df = create_leaderboard_df(results)
    bar_plot = create_model_comparison_plot(results)
    scatter_plot = create_scatter_plot(results)

    # show_search takes one of "none", "search" or "filter"; "none" is the
    # documented way to hide the search box (a bare False is not a listed value).
    gr.DataFrame(value=leaderboard_df, label="Model Leaderboard", show_search="none")
    gr.Plot(value=bar_plot, label="Model Comparison")
    gr.DataFrame(value=df, label="Language Results", show_search="search")
    gr.Plot(value=scatter_plot, label="Language Coverage")

demo.launch()
evals.py CHANGED
@@ -95,7 +95,7 @@ languages = pd.merge(benchmark_languages, languages, on="language_code", how="ou
95
  languages = pd.merge(languages, script_names, on="script_code", how="left")
96
  languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
97
  languages = languages.sort_values(by="speakers", ascending=False)
98
- languages = languages.iloc[:20]
99
 
100
  # sample languages to translate to
101
  target_languages_NEW = languages[languages["in_benchmark"]].sample(
@@ -103,7 +103,7 @@ target_languages_NEW = languages[languages["in_benchmark"]].sample(
103
  )
104
  # sample languages to analyze with all models
105
  detailed_languages = languages[languages["in_benchmark"]].sample(
106
- n=5, random_state=42
107
  )
108
 
109
 
@@ -214,7 +214,7 @@ async def main():
214
  "language_code": language.language_code,
215
  "speakers": language.speakers if not pd.isna(language.speakers) else 0,
216
  "scores": scores,
217
- "bleu": mean([s["bleu"] for s in scores]) or -0.02,
218
  # "bert_score": mean([s["bert_score"] for s in scores]),
219
  }
220
  )
 
95
  languages = pd.merge(languages, script_names, on="script_code", how="left")
96
  languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
97
  languages = languages.sort_values(by="speakers", ascending=False)
98
+ languages = languages.iloc[:30]
99
 
100
  # sample languages to translate to
101
  target_languages_NEW = languages[languages["in_benchmark"]].sample(
 
103
  )
104
  # sample languages to analyze with all models
105
  detailed_languages = languages[languages["in_benchmark"]].sample(
106
+ n=10, random_state=42
107
  )
108
 
109
 
 
214
  "language_code": language.language_code,
215
  "speakers": language.speakers if not pd.isna(language.speakers) else 0,
216
  "scores": scores,
217
+ "bleu": mean([s["bleu"] for s in scores]) if scores else None,
218
  # "bert_score": mean([s["bert_score"] for s in scores]),
219
  }
220
  )
results.json CHANGED
@@ -6,62 +6,42 @@
6
  "scores": [
7
  {
8
  "model": "openai/gpt-4o-mini",
9
- "bleu": 0.5103385437635193
10
  },
11
  {
12
  "model": "meta-llama/llama-3.3-70b-instruct",
13
- "bleu": 0.4845283039311465
14
  },
15
  {
16
  "model": "mistralai/mistral-small-24b-instruct-2501",
17
- "bleu": 0.4735424836788773
18
  },
19
  {
20
  "model": "google/gemini-2.0-flash-001",
21
- "bleu": 0.5639490578152662
22
  },
23
  {
24
  "model": "deepseek/deepseek-chat",
25
- "bleu": 0.5547524505965893
26
  },
27
  {
28
  "model": "microsoft/phi-4",
29
- "bleu": 0.48008677312779885
30
  }
31
  ],
32
- "bleu": 0.5111996021521995
33
  },
34
  {
35
  "language_name": "Mandarin Chinese",
36
  "language_code": "cmn",
37
  "speakers": 1074000000.0,
38
  "scores": [
39
- {
40
- "model": "openai/gpt-4o-mini",
41
- "bleu": 0.38427885971806375
42
- },
43
  {
44
  "model": "meta-llama/llama-3.3-70b-instruct",
45
- "bleu": 0.4309762560114817
46
- },
47
- {
48
- "model": "mistralai/mistral-small-24b-instruct-2501",
49
- "bleu": 0.40933363203497697
50
- },
51
- {
52
- "model": "google/gemini-2.0-flash-001",
53
- "bleu": 0.4486368724887284
54
- },
55
- {
56
- "model": "deepseek/deepseek-chat",
57
- "bleu": 0.4354691779014211
58
- },
59
- {
60
- "model": "microsoft/phi-4",
61
- "bleu": 0.3597312915524714
62
  }
63
  ],
64
- "bleu": 0.41140434828452394
65
  },
66
  {
67
  "language_name": "Spanish",
@@ -70,10 +50,10 @@
70
  "scores": [
71
  {
72
  "model": "meta-llama/llama-3.3-70b-instruct",
73
- "bleu": 0.41303609006378467
74
  }
75
  ],
76
- "bleu": 0.41303609006378467
77
  },
78
  {
79
  "language_name": "Hindi",
@@ -82,10 +62,10 @@
82
  "scores": [
83
  {
84
  "model": "meta-llama/llama-3.3-70b-instruct",
85
- "bleu": 0.39051313583666847
86
  }
87
  ],
88
- "bleu": 0.39051313583666847
89
  },
90
  {
91
  "language_name": "Bengali",
@@ -94,10 +74,10 @@
94
  "scores": [
95
  {
96
  "model": "meta-llama/llama-3.3-70b-instruct",
97
- "bleu": 0.3922760582029
98
  }
99
  ],
100
- "bleu": 0.3922760582029
101
  },
102
  {
103
  "language_name": "Portuguese",
@@ -106,10 +86,10 @@
106
  "scores": [
107
  {
108
  "model": "meta-llama/llama-3.3-70b-instruct",
109
- "bleu": 0.3569933404494365
110
  }
111
  ],
112
- "bleu": 0.3569933404494365
113
  },
114
  {
115
  "language_name": "French",
@@ -118,10 +98,10 @@
118
  "scores": [
119
  {
120
  "model": "meta-llama/llama-3.3-70b-instruct",
121
- "bleu": 0.4092873981445945
122
  }
123
  ],
124
- "bleu": 0.4092873981445945
125
  },
126
  {
127
  "language_name": "Indonesian",
@@ -130,10 +110,10 @@
130
  "scores": [
131
  {
132
  "model": "meta-llama/llama-3.3-70b-instruct",
133
- "bleu": 0.3671689105193036
134
  }
135
  ],
136
- "bleu": 0.3671689105193036
137
  },
138
  {
139
  "language_name": "Russian",
@@ -142,42 +122,62 @@
142
  "scores": [
143
  {
144
  "model": "openai/gpt-4o-mini",
145
- "bleu": 0.3821837153890323
146
  },
147
  {
148
  "model": "meta-llama/llama-3.3-70b-instruct",
149
- "bleu": 0.3974431757931015
150
  },
151
  {
152
  "model": "mistralai/mistral-small-24b-instruct-2501",
153
- "bleu": 0.2541840010941474
154
  },
155
  {
156
  "model": "google/gemini-2.0-flash-001",
157
- "bleu": 0.43388586741780116
158
  },
159
  {
160
  "model": "deepseek/deepseek-chat",
161
- "bleu": 0.4148930468752925
162
  },
163
  {
164
  "model": "microsoft/phi-4",
165
- "bleu": 0.3530948239011605
166
  }
167
  ],
168
- "bleu": 0.3726141050784226
169
  },
170
  {
171
  "language_name": "Japanese",
172
  "language_code": "jpn",
173
  "speakers": 128000000.0,
174
  "scores": [
 
 
 
 
175
  {
176
  "model": "meta-llama/llama-3.3-70b-instruct",
177
- "bleu": 0.294012705268792
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  }
179
  ],
180
- "bleu": 0.294012705268792
181
  },
182
  {
183
  "language_name": "Eastern Punjabi",
@@ -186,46 +186,106 @@
186
  "scores": [
187
  {
188
  "model": "meta-llama/llama-3.3-70b-instruct",
189
- "bleu": 0.37715805829458243
190
  }
191
  ],
192
- "bleu": 0.37715805829458243
193
  },
194
  {
195
  "language_name": "Standard German",
196
  "language_code": "deu",
197
  "speakers": 105000000.0,
198
  "scores": [
 
 
 
 
199
  {
200
  "model": "meta-llama/llama-3.3-70b-instruct",
201
- "bleu": 0.39190456406769925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  }
203
  ],
204
- "bleu": 0.39190456406769925
205
  },
206
  {
207
  "language_name": "Egyptian Arabic",
208
  "language_code": "arz",
209
  "speakers": 100542400.0,
210
  "scores": [
 
 
 
 
211
  {
212
  "model": "meta-llama/llama-3.3-70b-instruct",
213
- "bleu": 0.2769739921069721
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  }
215
  ],
216
- "bleu": 0.2769739921069721
217
  },
218
  {
219
  "language_name": "Urdu",
220
  "language_code": "urd",
221
  "speakers": 94022900.0,
222
  "scores": [
 
 
 
 
223
  {
224
  "model": "meta-llama/llama-3.3-70b-instruct",
225
- "bleu": 0.30532627541695706
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
227
  ],
228
- "bleu": 0.30532627541695706
229
  },
230
  {
231
  "language_name": "Filipino",
@@ -234,109 +294,259 @@
234
  "scores": [
235
  {
236
  "model": "meta-llama/llama-3.3-70b-instruct",
237
- "bleu": 0.38380780370055084
238
  }
239
  ],
240
- "bleu": 0.38380780370055084
241
  },
242
  {
243
  "language_name": "Javanese",
244
  "language_code": "jav",
245
  "speakers": 84308740.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
  "scores": [
247
  {
248
  "model": "openai/gpt-4o-mini",
249
- "bleu": 0.303156768433342
250
  },
251
  {
252
  "model": "meta-llama/llama-3.3-70b-instruct",
253
- "bleu": 0.3147001751424492
254
  },
255
  {
256
  "model": "mistralai/mistral-small-24b-instruct-2501",
257
- "bleu": 0.1507764424388819
258
  },
259
  {
260
  "model": "google/gemini-2.0-flash-001",
261
- "bleu": 0.41409824694226155
262
  },
263
  {
264
  "model": "deepseek/deepseek-chat",
265
- "bleu": 0.3240536705195471
266
  },
267
  {
268
  "model": "microsoft/phi-4",
269
- "bleu": 0.22770614610795217
270
  }
271
  ],
272
- "bleu": 0.2890819082640723
273
  },
274
  {
275
- "language_name": "Marathi",
276
- "language_code": "mar",
277
- "speakers": 83100000.0,
278
  "scores": [
279
  {
280
  "model": "meta-llama/llama-3.3-70b-instruct",
281
- "bleu": 0.3754377211201414
282
  }
283
  ],
284
- "bleu": 0.3754377211201414
285
  },
286
  {
287
- "language_name": "Swahili",
288
- "language_code": "swh",
289
- "speakers": 82300000.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  "scores": [
291
  {
292
  "model": "openai/gpt-4o-mini",
293
- "bleu": 0.3698648558947496
294
  },
295
  {
296
  "model": "meta-llama/llama-3.3-70b-instruct",
297
- "bleu": 0.31914577240036923
298
  },
299
  {
300
  "model": "mistralai/mistral-small-24b-instruct-2501",
301
- "bleu": 0.16066681130875948
302
  },
303
  {
304
  "model": "google/gemini-2.0-flash-001",
305
- "bleu": 0.3934769032884265
306
  },
307
  {
308
  "model": "deepseek/deepseek-chat",
309
- "bleu": 0.3605623890073268
310
  },
311
  {
312
  "model": "microsoft/phi-4",
313
- "bleu": 0.175030478984087
314
  }
315
  ],
316
- "bleu": 0.2964578684806198
317
  },
318
  {
319
- "language_name": "Turkish",
320
- "language_code": "tur",
321
- "speakers": 82231620.0,
322
  "scores": [
 
 
 
 
323
  {
324
  "model": "meta-llama/llama-3.3-70b-instruct",
325
- "bleu": 0.37080958221553817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  }
327
  ],
328
- "bleu": 0.37080958221553817
329
  },
330
  {
331
- "language_name": "Telugu",
332
- "language_code": "tel",
333
- "speakers": 82000000.0,
 
 
 
 
 
 
 
 
 
 
 
 
334
  "scores": [
335
  {
336
  "model": "meta-llama/llama-3.3-70b-instruct",
337
- "bleu": 0.35400532981470717
338
  }
339
  ],
340
- "bleu": 0.35400532981470717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  }
342
  ]
 
6
  "scores": [
7
  {
8
  "model": "openai/gpt-4o-mini",
9
+ "bleu": 0.47104084248165595
10
  },
11
  {
12
  "model": "meta-llama/llama-3.3-70b-instruct",
13
+ "bleu": 0.4207265890491719
14
  },
15
  {
16
  "model": "mistralai/mistral-small-24b-instruct-2501",
17
+ "bleu": 0.4642719176436136
18
  },
19
  {
20
  "model": "google/gemini-2.0-flash-001",
21
+ "bleu": 0.5237470882988915
22
  },
23
  {
24
  "model": "deepseek/deepseek-chat",
25
+ "bleu": 0.516570670982587
26
  },
27
  {
28
  "model": "microsoft/phi-4",
29
+ "bleu": 0.44668905281921456
30
  }
31
  ],
32
+ "bleu": 0.47384102687918905
33
  },
34
  {
35
  "language_name": "Mandarin Chinese",
36
  "language_code": "cmn",
37
  "speakers": 1074000000.0,
38
  "scores": [
 
 
 
 
39
  {
40
  "model": "meta-llama/llama-3.3-70b-instruct",
41
+ "bleu": 0.48254866511762295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  }
43
  ],
44
+ "bleu": 0.48254866511762295
45
  },
46
  {
47
  "language_name": "Spanish",
 
50
  "scores": [
51
  {
52
  "model": "meta-llama/llama-3.3-70b-instruct",
53
+ "bleu": 0.31606621368361204
54
  }
55
  ],
56
+ "bleu": 0.31606621368361204
57
  },
58
  {
59
  "language_name": "Hindi",
 
62
  "scores": [
63
  {
64
  "model": "meta-llama/llama-3.3-70b-instruct",
65
+ "bleu": 0.3273225856613046
66
  }
67
  ],
68
+ "bleu": 0.3273225856613046
69
  },
70
  {
71
  "language_name": "Bengali",
 
74
  "scores": [
75
  {
76
  "model": "meta-llama/llama-3.3-70b-instruct",
77
+ "bleu": 0.23110496173302814
78
  }
79
  ],
80
+ "bleu": 0.23110496173302814
81
  },
82
  {
83
  "language_name": "Portuguese",
 
86
  "scores": [
87
  {
88
  "model": "meta-llama/llama-3.3-70b-instruct",
89
+ "bleu": 0.35032125995743685
90
  }
91
  ],
92
+ "bleu": 0.35032125995743685
93
  },
94
  {
95
  "language_name": "French",
 
98
  "scores": [
99
  {
100
  "model": "meta-llama/llama-3.3-70b-instruct",
101
+ "bleu": 0.31625053573185663
102
  }
103
  ],
104
+ "bleu": 0.31625053573185663
105
  },
106
  {
107
  "language_name": "Indonesian",
 
110
  "scores": [
111
  {
112
  "model": "meta-llama/llama-3.3-70b-instruct",
113
+ "bleu": 0.3112185444311794
114
  }
115
  ],
116
+ "bleu": 0.3112185444311794
117
  },
118
  {
119
  "language_name": "Russian",
 
122
  "scores": [
123
  {
124
  "model": "openai/gpt-4o-mini",
125
+ "bleu": 0.32615858913589074
126
  },
127
  {
128
  "model": "meta-llama/llama-3.3-70b-instruct",
129
+ "bleu": 0.3244999119385425
130
  },
131
  {
132
  "model": "mistralai/mistral-small-24b-instruct-2501",
133
+ "bleu": 0.315801608032821
134
  },
135
  {
136
  "model": "google/gemini-2.0-flash-001",
137
+ "bleu": 0.3683733679689521
138
  },
139
  {
140
  "model": "deepseek/deepseek-chat",
141
+ "bleu": 0.35988734604889566
142
  },
143
  {
144
  "model": "microsoft/phi-4",
145
+ "bleu": 0.31289371159965956
146
  }
147
  ],
148
+ "bleu": 0.3346024224541269
149
  },
150
  {
151
  "language_name": "Japanese",
152
  "language_code": "jpn",
153
  "speakers": 128000000.0,
154
  "scores": [
155
+ {
156
+ "model": "openai/gpt-4o-mini",
157
+ "bleu": 0.28991739992953497
158
+ },
159
  {
160
  "model": "meta-llama/llama-3.3-70b-instruct",
161
+ "bleu": 0.2675679907215641
162
+ },
163
+ {
164
+ "model": "mistralai/mistral-small-24b-instruct-2501",
165
+ "bleu": 0.21348802780641032
166
+ },
167
+ {
168
+ "model": "google/gemini-2.0-flash-001",
169
+ "bleu": 0.3345265427223546
170
+ },
171
+ {
172
+ "model": "deepseek/deepseek-chat",
173
+ "bleu": 0.3101203037558905
174
+ },
175
+ {
176
+ "model": "microsoft/phi-4",
177
+ "bleu": 0.2585222780278109
178
  }
179
  ],
180
+ "bleu": 0.2790237571605942
181
  },
182
  {
183
  "language_name": "Eastern Punjabi",
 
186
  "scores": [
187
  {
188
  "model": "meta-llama/llama-3.3-70b-instruct",
189
+ "bleu": 0.27325501919134315
190
  }
191
  ],
192
+ "bleu": 0.27325501919134315
193
  },
194
  {
195
  "language_name": "Standard German",
196
  "language_code": "deu",
197
  "speakers": 105000000.0,
198
  "scores": [
199
+ {
200
+ "model": "openai/gpt-4o-mini",
201
+ "bleu": 0.39019323183176663
202
+ },
203
  {
204
  "model": "meta-llama/llama-3.3-70b-instruct",
205
+ "bleu": 0.37266353070949576
206
+ },
207
+ {
208
+ "model": "mistralai/mistral-small-24b-instruct-2501",
209
+ "bleu": 0.3647632576435612
210
+ },
211
+ {
212
+ "model": "google/gemini-2.0-flash-001",
213
+ "bleu": 0.4466723425292597
214
+ },
215
+ {
216
+ "model": "deepseek/deepseek-chat",
217
+ "bleu": 0.4045496243095387
218
+ },
219
+ {
220
+ "model": "microsoft/phi-4",
221
+ "bleu": 0.36047992103881465
222
  }
223
  ],
224
+ "bleu": 0.3898869846770727
225
  },
226
  {
227
  "language_name": "Egyptian Arabic",
228
  "language_code": "arz",
229
  "speakers": 100542400.0,
230
  "scores": [
231
+ {
232
+ "model": "openai/gpt-4o-mini",
233
+ "bleu": 0.2339779422333898
234
+ },
235
  {
236
  "model": "meta-llama/llama-3.3-70b-instruct",
237
+ "bleu": 0.20475486619797384
238
+ },
239
+ {
240
+ "model": "mistralai/mistral-small-24b-instruct-2501",
241
+ "bleu": 0.20783660453505234
242
+ },
243
+ {
244
+ "model": "google/gemini-2.0-flash-001",
245
+ "bleu": 0.2840808045687292
246
+ },
247
+ {
248
+ "model": "deepseek/deepseek-chat",
249
+ "bleu": 0.2786287793608212
250
+ },
251
+ {
252
+ "model": "microsoft/phi-4",
253
+ "bleu": 0.19969813973959594
254
  }
255
  ],
256
+ "bleu": 0.23482952277259375
257
  },
258
  {
259
  "language_name": "Urdu",
260
  "language_code": "urd",
261
  "speakers": 94022900.0,
262
  "scores": [
263
+ {
264
+ "model": "openai/gpt-4o-mini",
265
+ "bleu": 0.297325653414119
266
+ },
267
  {
268
  "model": "meta-llama/llama-3.3-70b-instruct",
269
+ "bleu": 0.24593966310665433
270
+ },
271
+ {
272
+ "model": "mistralai/mistral-small-24b-instruct-2501",
273
+ "bleu": 0.21988755291389567
274
+ },
275
+ {
276
+ "model": "google/gemini-2.0-flash-001",
277
+ "bleu": 0.31796430998058983
278
+ },
279
+ {
280
+ "model": "deepseek/deepseek-chat",
281
+ "bleu": 0.3043614136242901
282
+ },
283
+ {
284
+ "model": "microsoft/phi-4",
285
+ "bleu": 0.2285337340113323
286
  }
287
  ],
288
+ "bleu": 0.2690020545084802
289
  },
290
  {
291
  "language_name": "Filipino",
 
294
  "scores": [
295
  {
296
  "model": "meta-llama/llama-3.3-70b-instruct",
297
+ "bleu": 0.33268969497468076
298
  }
299
  ],
300
+ "bleu": 0.33268969497468076
301
  },
302
  {
303
  "language_name": "Javanese",
304
  "language_code": "jav",
305
  "speakers": 84308740.0,
306
+ "scores": [
307
+ {
308
+ "model": "meta-llama/llama-3.3-70b-instruct",
309
+ "bleu": 0.2528746866064681
310
+ }
311
+ ],
312
+ "bleu": 0.2528746866064681
313
+ },
314
+ {
315
+ "language_name": "Marathi",
316
+ "language_code": "mar",
317
+ "speakers": 83100000.0,
318
+ "scores": [
319
+ {
320
+ "model": "meta-llama/llama-3.3-70b-instruct",
321
+ "bleu": 0.24876051941895777
322
+ }
323
+ ],
324
+ "bleu": 0.24876051941895777
325
+ },
326
+ {
327
+ "language_name": "Swahili",
328
+ "language_code": "swh",
329
+ "speakers": 82300000.0,
330
  "scores": [
331
  {
332
  "model": "openai/gpt-4o-mini",
333
+ "bleu": 0.34863560100932933
334
  },
335
  {
336
  "model": "meta-llama/llama-3.3-70b-instruct",
337
+ "bleu": 0.30524292832054034
338
  },
339
  {
340
  "model": "mistralai/mistral-small-24b-instruct-2501",
341
+ "bleu": 0.23580256234118713
342
  },
343
  {
344
  "model": "google/gemini-2.0-flash-001",
345
+ "bleu": 0.3871437234807849
346
  },
347
  {
348
  "model": "deepseek/deepseek-chat",
349
+ "bleu": 0.3476225063617937
350
  },
351
  {
352
  "model": "microsoft/phi-4",
353
+ "bleu": 0.21803176063271826
354
  }
355
  ],
356
+ "bleu": 0.3070798470243923
357
  },
358
  {
359
+ "language_name": "Turkish",
360
+ "language_code": "tur",
361
+ "speakers": 82231620.0,
362
  "scores": [
363
  {
364
  "model": "meta-llama/llama-3.3-70b-instruct",
365
+ "bleu": 0.29874140544434125
366
  }
367
  ],
368
+ "bleu": 0.29874140544434125
369
  },
370
  {
371
+ "language_name": "Telugu",
372
+ "language_code": "tel",
373
+ "speakers": 82000000.0,
374
+ "scores": [
375
+ {
376
+ "model": "meta-llama/llama-3.3-70b-instruct",
377
+ "bleu": 0.28869836899054496
378
+ }
379
+ ],
380
+ "bleu": 0.28869836899054496
381
+ },
382
+ {
383
+ "language_name": "Wu Chinese",
384
+ "language_code": "wuu",
385
+ "speakers": 81400000.0,
386
+ "scores": [],
387
+ "bleu": null
388
+ },
389
+ {
390
+ "language_name": "Korean",
391
+ "language_code": "kor",
392
+ "speakers": 77300000.0,
393
+ "scores": [
394
+ {
395
+ "model": "meta-llama/llama-3.3-70b-instruct",
396
+ "bleu": 0.2566453806044083
397
+ }
398
+ ],
399
+ "bleu": 0.2566453806044083
400
+ },
401
+ {
402
+ "language_name": "Vietnamese",
403
+ "language_code": "vie",
404
+ "speakers": 76000000.0,
405
  "scores": [
406
  {
407
  "model": "openai/gpt-4o-mini",
408
+ "bleu": 0.3104431723374164
409
  },
410
  {
411
  "model": "meta-llama/llama-3.3-70b-instruct",
412
+ "bleu": 0.3098478561790782
413
  },
414
  {
415
  "model": "mistralai/mistral-small-24b-instruct-2501",
416
+ "bleu": 0.28074941515909896
417
  },
418
  {
419
  "model": "google/gemini-2.0-flash-001",
420
+ "bleu": 0.37327273228460267
421
  },
422
  {
423
  "model": "deepseek/deepseek-chat",
424
+ "bleu": 0.3487726531917602
425
  },
426
  {
427
  "model": "microsoft/phi-4",
428
+ "bleu": 0.18355331419148843
429
  }
430
  ],
431
+ "bleu": 0.3011065238905742
432
  },
433
  {
434
+ "language_name": "Tamil",
435
+ "language_code": "tam",
436
+ "speakers": 75000000.0,
437
  "scores": [
438
+ {
439
+ "model": "openai/gpt-4o-mini",
440
+ "bleu": 0.24593649157372188
441
+ },
442
  {
443
  "model": "meta-llama/llama-3.3-70b-instruct",
444
+ "bleu": 0.24009996232522382
445
+ },
446
+ {
447
+ "model": "mistralai/mistral-small-24b-instruct-2501",
448
+ "bleu": 0.16785828803139252
449
+ },
450
+ {
451
+ "model": "google/gemini-2.0-flash-001",
452
+ "bleu": 0.3411457686951495
453
+ },
454
+ {
455
+ "model": "deepseek/deepseek-chat",
456
+ "bleu": 0.2875340171253509
457
+ },
458
+ {
459
+ "model": "microsoft/phi-4",
460
+ "bleu": 0.12646276530642359
461
  }
462
  ],
463
+ "bleu": 0.23483954884287706
464
  },
465
  {
466
+ "language_name": "Yue Chinese",
467
+ "language_code": "yue",
468
+ "speakers": 73100000.0,
469
+ "scores": [
470
+ {
471
+ "model": "meta-llama/llama-3.3-70b-instruct",
472
+ "bleu": 0.2663995648378034
473
+ }
474
+ ],
475
+ "bleu": 0.2663995648378034
476
+ },
477
+ {
478
+ "language_name": "Italian",
479
+ "language_code": "ita",
480
+ "speakers": 64819790.0,
481
  "scores": [
482
  {
483
  "model": "meta-llama/llama-3.3-70b-instruct",
484
+ "bleu": 0.3190660116366235
485
  }
486
  ],
487
+ "bleu": 0.3190660116366235
488
+ },
489
+ {
490
+ "language_name": "Gujarati",
491
+ "language_code": "guj",
492
+ "speakers": 56400000.0,
493
+ "scores": [
494
+ {
495
+ "model": "openai/gpt-4o-mini",
496
+ "bleu": 0.25754571533357745
497
+ },
498
+ {
499
+ "model": "meta-llama/llama-3.3-70b-instruct",
500
+ "bleu": 0.24145756515188838
501
+ },
502
+ {
503
+ "model": "mistralai/mistral-small-24b-instruct-2501",
504
+ "bleu": 0.20092063514315023
505
+ },
506
+ {
507
+ "model": "google/gemini-2.0-flash-001",
508
+ "bleu": 0.3664134239402827
509
+ },
510
+ {
511
+ "model": "deepseek/deepseek-chat",
512
+ "bleu": 0.2908883229704476
513
+ },
514
+ {
515
+ "model": "microsoft/phi-4",
516
+ "bleu": 0.19669824113063106
517
+ }
518
+ ],
519
+ "bleu": 0.2589873172783296
520
+ },
521
+ {
522
+ "language_name": "Iranian Persian",
523
+ "language_code": "pes",
524
+ "speakers": 52800000.0,
525
+ "scores": [
526
+ {
527
+ "model": "meta-llama/llama-3.3-70b-instruct",
528
+ "bleu": 0.28359916806993934
529
+ }
530
+ ],
531
+ "bleu": 0.28359916806993934
532
+ },
533
+ {
534
+ "language_name": "Bhojpuri",
535
+ "language_code": "bho",
536
+ "speakers": 52200000.0,
537
+ "scores": [
538
+ {
539
+ "model": "meta-llama/llama-3.3-70b-instruct",
540
+ "bleu": 0.24311504988281543
541
+ }
542
+ ],
543
+ "bleu": 0.24311504988281543
544
+ },
545
+ {
546
+ "language_name": "Hakka Chinese",
547
+ "language_code": "hak",
548
+ "speakers": 48200000.0,
549
+ "scores": [],
550
+ "bleu": null
551
  }
552
  ]