David Pomerenke committed
Commit 4f572a5 · 1 Parent(s): e92634d

Metrics selector & refactoring

Files changed (3)
  1. app.py +253 -106
  2. evals.py +28 -26
  3. results.json +210 -210
app.py CHANGED
@@ -2,22 +2,74 @@ import json
 
 import gradio as gr
 import pandas as pd
-import plotly.graph_objects as go
 import plotly.express as px
+import plotly.graph_objects as go
 import pycountry
 
 with open("results.json") as f:
     results = json.load(f)
 
+# Global constants for metric mappings
+METRICS = {
+    "overall_performance": {
+        "display_name": "Overall Performance",
+        "field_name": "overall_score",
+        "label": "Overall Performance Score",
+        "explanation": """
+        **Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        Higher scores indicate better overall language capabilities.
+        """,
+    },
+    "translation_bleu": {
+        "display_name": "Translation (BLEU)",
+        "field_name": "mt_bleu",
+        "label": "BLEU Score",
+        "explanation": """
+        **Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
+        It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
+        """,
+    },
+    "translation_chrf": {
+        "display_name": "Translation (ChrF)",
+        "field_name": "mt_chrf",
+        "label": "ChrF Score",
+        "explanation": """
+        **Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
+        This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
+        Higher scores (0-1) indicate better translations.
+        """,
+    },
+    "classification_accuracy": {
+        "display_name": "Classification (Accuracy)",
+        "field_name": "cls_acc",
+        "label": "Classification Accuracy",
+        "explanation": """
+        **Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
+        This evaluates a model's understanding of content and context across different languages.
+        Reported as a percentage where higher values indicate better classification performance.
+        """,
+    },
+    "mlm_chrf": {
+        "display_name": "Masked Language Modeling (ChrF)",
+        "field_name": "mlm_chrf",
+        "label": "MLM ChrF Score",
+        "explanation": """
+        **Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
+        This tests a model's understanding of language structure and semantics by measuring the character-level similarity
+        between predicted and actual text. Higher scores indicate better language understanding.
+        """,
+    },
+}
+
 
 def mean(lst):
     return sum(lst) / len(lst)
 
 
-def create_leaderboard_df(results):
+def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
-    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
-    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
+    langs_with_score = [lang for lang in results if lang[metric['field_name']] is not None]
+    sorted_langs = sorted(langs_with_score, key=lambda x: x[metric['field_name']], reverse=True)
     n_langs = len(sorted_langs)
     high_cutoff = n_langs // 4  # top 25%
     low_cutoff = n_langs - n_langs // 4  # bottom 25%
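Reviewer note on the new `METRICS` registry: each entry carries a `display_name` for the dropdown, a `field_name` that keys directly into the rows of results.json, and a `label`/`explanation` for the UI, so the plotting functions below can stay metric-agnostic. A minimal sketch of the lookup (the toy result row is made up):

```python
# Minimal sketch of the registry pattern; the `row` values here are hypothetical.
METRICS = {
    "translation_chrf": {
        "display_name": "Translation (ChrF)",
        "field_name": "mt_chrf",
        "label": "ChrF Score",
    },
}

row = {"language_name": "German", "mt_chrf": 57.66, "mt_bleu": 0.39}  # toy row

metric = METRICS["translation_chrf"]
value = row[metric["field_name"]]  # generic access instead of hard-coded row["mt_chrf"]
print(f"{metric['label']}: {value}")  # -> ChrF Score: 57.66
```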
@@ -45,7 +97,7 @@ def create_leaderboard_df(results):
                 "Mid-Resource": [],
                 "Low-Resource": [],
             }
-        model_scores[model][category].append(score["bleu"])
+        model_scores[model][category].append(score[metric['field_name']])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
@@ -79,17 +131,17 @@ def create_leaderboard_df(results):
         leaderboard_data.append(
             {
                 "Model": f"[{model_name}](https://openrouter.ai/{model})",
-                "Overall BLEU": overall_avg,
-                "High-Resource BLEU": high_avg,
-                "Mid-Resource BLEU": mid_avg,
-                "Low-Resource BLEU": low_avg,
+                "Overall Score": overall_avg,
+                "High-Resource Score": high_avg,
+                "Mid-Resource Score": mid_avg,
+                "Low-Resource Score": low_avg,
                 "Languages Tested": len(all_scores),
             }
         )
 
     # Sort by overall BLEU
     df = pd.DataFrame(leaderboard_data)
-    df = df.sort_values("Overall BLEU", ascending=False)
+    df = df.sort_values("Overall Score", ascending=False)
 
     # Add rank and medals
     df["Rank"] = range(1, len(df) + 1)
@@ -102,10 +154,10 @@ def create_leaderboard_df(results):
         [
             "Rank",
             "Model",
-            "Overall BLEU",
-            "High-Resource BLEU",
-            "Mid-Resource BLEU",
-            "Low-Resource BLEU",
+            "Overall Score",
+            "High-Resource Score",
+            "Mid-Resource Score",
+            "Low-Resource Score",
             "Languages Tested",
         ]
     ]
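The resource-category cutoffs are plain integer arithmetic over the ranked language list. The assignment logic between these hunks is not part of the diff, so the branching below is an assumption about how the cutoffs are applied:

```python
# Worked example of the quartile cutoffs (assumed category assignment).
n_langs = 18
high_cutoff = n_langs // 4           # 4 -> ranks 0..3 count as High-Resource
low_cutoff = n_langs - n_langs // 4  # 14 -> ranks 14..17 count as Low-Resource

categories = []
for rank in range(n_langs):
    if rank < high_cutoff:
        categories.append("High-Resource")
    elif rank >= low_cutoff:
        categories.append("Low-Resource")
    else:
        categories.append("Mid-Resource")

assert categories.count("High-Resource") == 4
assert categories.count("Low-Resource") == 4
assert categories.count("Mid-Resource") == 10
```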
@@ -126,19 +178,34 @@ def create_leaderboard_df(results):
     )
 
 
-def create_model_comparison_plot(results):
+def create_model_comparison_plot(metric):
     top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]
-    scores_flat = [
-        {"language": lang["language_name"], "model": score["model"], "bleu": score["bleu"]}
-        for lang in top_languages
-        for score in lang["scores"]
-    ]
+
+    # Create appropriate title and y-axis label based on metric
+    title = f"{metric['display_name']} by Model and Language"
+    y_label = metric['label']
+
+    # Flatten the data for the selected metric
+    scores_flat = []
+    for lang in top_languages:
+        for score in lang["scores"]:
+            # Get the value directly using the field name
+            value = score[metric['field_name']]
+            if value is not None:
+                scores_flat.append(
+                    {
+                        "language": lang["language_name"],
+                        "model": score["model"],
+                        "value": value,
+                    }
+                )
+
     df = pd.DataFrame(scores_flat)
-    fig = px.bar(df, x="language", y="bleu", color="model", barmode="group")
+    fig = px.bar(df, x="language", y="value", color="model", barmode="group")
     fig.update_layout(
-        title="BLEU Scores by Model and Language",
+        title=title,
         xaxis_title=None,
-        yaxis_title="BLEU Score",
+        yaxis_title=y_label,
         barmode="group",
         height=500,
         legend=dict(
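`px.bar` with `color=` expects long-format data, one record per (language, model) pair, which is exactly what the rewritten loop produces; it also drops `None` scores instead of passing them to the plot. A self-contained miniature with toy scores:

```python
# Long-format flattening as consumed by px.bar (toy data).
import pandas as pd
import plotly.express as px

top_languages = [
    {"language_name": "English", "scores": [{"model": "m1", "mt_chrf": 65.9},
                                            {"model": "m2", "mt_chrf": None}]},
    {"language_name": "Hindi", "scores": [{"model": "m1", "mt_chrf": 53.5}]},
]
field = "mt_chrf"  # would come from metric['field_name']

rows = [
    {"language": lang["language_name"], "model": s["model"], "value": s[field]}
    for lang in top_languages
    for s in lang["scores"]
    if s[field] is not None  # None would otherwise produce broken bars
]
df = pd.DataFrame(rows)  # 2 rows: English/m1 and Hindi/m1
fig = px.bar(df, x="language", y="value", color="model", barmode="group")
```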
@@ -152,17 +219,18 @@ def create_model_comparison_plot(results):
     return fig
 
 
-def create_language_stats_df(results):
+def create_language_stats_df(metric):
     # Create a list to store flattened data
     flat_data = []
 
     for lang in results:
         # Find the best model and its BLEU score
-        best_score = max(
-            lang["scores"] or [{"overall_score": None, "model": None}], key=lambda x: x["overall_score"]
+        best_model = max(
+            lang["scores"] or [{"overall_score": None, "model": None}],
+            key=lambda x: x["overall_score"],
         )
 
-        model = best_score["model"]
+        model = best_model["model"]
         model_name = model.split("/")[-1] if model else "N/A"
         model_link = (
             f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
@@ -181,14 +249,14 @@ def create_language_stats_df(results):
                 "Overall": round(lang["overall_score"], 3)
                 if lang["overall_score"] is not None
                 else "N/A",
-                "Trans-lation": round(lang["bleu"], 3)
-                if lang["bleu"] is not None
+                "Trans-lation": round(lang["mt_bleu"], 3)
+                if lang["mt_bleu"] is not None
                 else "N/A",
-                "Classi-fication": round(lang["accuracy"], 3)
-                if lang["accuracy"] is not None
+                "Classi-fication": round(lang["cls_acc"], 3)
+                if lang["cls_acc"] is not None
                 else "N/A",
-                "MLM": round(lang["mlm"], 3)
-                if lang["mlm"] is not None
+                "MLM": round(lang["mlm_chrf"], 3)
+                if lang["mlm_chrf"] is not None
                 else "N/A",
                 "Best Model": model_link,
                 "CommonVoice Hours": commonvoice_link,
@@ -201,27 +269,54 @@ def create_language_stats_df(results):
         label="Language Results",
         show_search="search",
         datatype=[
-            "markdown",  # Language
-            "number",  # Speakers
+            "markdown",  # Language
+            "number",  # Speakers
             # "number",  # Models Tested
-            "number",  # Overall
-            "number",  # Translation
-            "number",  # Classification
-            "number",  # MLM
-            "markdown",  # Best Model
-            "markdown",  # CommonVoice Hours
+            "number",  # Overall
+            "number",  # Translation
+            "number",  # Classification
+            "number",  # MLM
+            "markdown",  # Best Model
+            "markdown",  # CommonVoice Hours
         ],
     )
 
 
-def create_scatter_plot(results):
+def create_scatter_plot(metric):
+    # Filter results to include only languages with sufficient speakers
+    filtered_results = [lang for lang in results if lang["speakers"] >= 10_000]
+
+    # Create a list to store data for the scatter plot
+    scatter_data = []
+
+    for lang in filtered_results:
+        # Calculate average score for this metric across all models
+        scores = [
+            score[metric['field_name']]
+            for score in lang["scores"]
+            if score[metric['field_name']] is not None
+        ]
+        if scores:  # Only include if we have valid scores
+            avg_score = sum(scores) / len(scores)
+            scatter_data.append(
+                {
+                    "language": lang["language_name"],
+                    "speakers": lang["speakers"],
+                    "score": avg_score,
+                }
+            )
+
     fig = go.Figure()
 
+    # Convert speakers to millions for display
     x_vals = [
-        lang["speakers"] / 1_000_000 for lang in results if lang["speakers"] >= 10_000
+        data["speakers"] / 1_000_000 for data in scatter_data
     ]  # Convert to millions
-    y_vals = [lang["bleu"] for lang in results]
-    labels = [lang["language_name"] for lang in results]
+    y_vals = [data["score"] for data in scatter_data]
+    labels = [data["language"] for data in scatter_data]
+
+    # Create hover template
+    hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
 
     fig.add_trace(
         go.Scatter(
@@ -230,16 +325,14 @@ def create_scatter_plot(results):
             mode="markers+text",
             text=labels,
             textposition="top center",
-            hovertemplate="<b>%{text}</b><br>"
-            + "Speakers: %{x:.1f}M<br>"
-            + "BLEU Score: %{y:.3f}<extra></extra>",
+            hovertemplate=hover_template,
         )
     )
 
     fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
-        yaxis_title="Average BLEU Score",
+        yaxis_title=metric['label'],
        height=500,
        showlegend=False,
    )
@@ -247,7 +340,7 @@ def create_scatter_plot(results):
     # Use log scale for x-axis since speaker numbers vary widely
     fig.update_xaxes(type="log")
 
-    return gr.Plot(value=fig, label="Speaker population vs BLEU")
+    return fig
 
 
 def format_number(n):
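The scatter rewrite also fixes a latent alignment bug: the old code filtered `x_vals` by speaker count but left `y_vals` and `labels` unfiltered, so the three lists could diverge in length. Building one record per language first keeps them aligned by construction:

```python
# Why scatter_data-first matters: one record per language keeps x, y, labels aligned.
scatter_data = [
    {"language": "German", "speakers": 136_000_000, "score": 57.7},  # toy values
    {"language": "Javanese", "speakers": 68_000_000, "score": 48.9},
]
x_vals = [d["speakers"] / 1_000_000 for d in scatter_data]
y_vals = [d["score"] for d in scatter_data]
labels = [d["language"] for d in scatter_data]
assert len(x_vals) == len(y_vals) == len(labels)
```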
@@ -258,8 +351,10 @@ def format_number(n):
         return f"{n/1_000:.0f}K"
     return str(n)
 
+
 def get_population_data():
     import xml.etree.ElementTree as ET
+
     from language_data.util import data_filename
 
     filename = data_filename("supplementalData.xml")
@@ -268,17 +363,46 @@ def get_population_data():
 
     data = {}
     for territory in territories:
-        t_code = territory.attrib['type']
-        t_population = float(territory.attrib['population'])
+        t_code = territory.attrib["type"]
+        t_population = float(territory.attrib["population"])
         data[t_code] = t_population
     return data
 
-def create_world_map(results):
+# Helper functions for visualization
+def make_black_bar(value, max_width=10):
+    filled = int(value * max_width)
+    return "⬛️" * filled + "⬜️" * (max_width - filled)
+
+
+def make_colored_bar(score, max_width=10):
+    """Create a colored bar using Unicode blocks based on normalized score
+    🟦 for high values (>0.35)
+    🟨 for medium values (0.25-0.35)
+    🟥 for low values (<0.25)
+    ⬜ for empty space
+
+    This function handles both normalization and bar creation.
+    """
+
+    # Create the bar based on normalized value
+    filled = int(score * max_width)
+    filled = max(0, min(filled, max_width))
+    empty = max_width - filled
+
+    if score > 0.35:
+        return "🟦" * filled + "⬜" * empty
+    elif score > 0.25:
+        return "🟨" * filled + "⬜" * empty
+    else:
+        return "🟥" * filled + "⬜" * empty
+
+def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
     for lang in results:
-        if "population" not in lang or lang["bleu"] is None:
+        # Skip languages without the required data
+        if "population" not in lang or lang[metric['field_name']] is None:
             continue
 
         for country_code, speakers in lang["population"].items():
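For reference, this is what the emoji bars render to at the thresholds in `make_colored_bar` (condensed re-implementation, same logic). Note that the function expects a roughly 0-1 score; a raw 0-100 ChrF value would clamp to a fully filled blue bar, so the "normalized score" in the docstring matters:

```python
# Condensed re-implementation of make_colored_bar, for illustration only.
def make_colored_bar(score, max_width=10):
    filled = max(0, min(int(score * max_width), max_width))
    block = "🟦" if score > 0.35 else "🟨" if score > 0.25 else "🟥"
    return block * filled + "⬜" * (max_width - filled)

for s in (0.1, 0.3, 0.6):
    print(f"{s:.1f} {make_colored_bar(s)}")
# 0.1 🟥⬜⬜⬜⬜⬜⬜⬜⬜⬜
# 0.3 🟨🟨🟨⬜⬜⬜⬜⬜⬜⬜
# 0.6 🟦🟦🟦🟦🟦🟦⬜⬜⬜⬜
```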
@@ -293,17 +417,19 @@ def create_world_map(results):
                 country_data[iso3_code] = {
                     "total_speakers": 0,
                     "population": population_data.get(country_code, 0),
-                    "weighted_bleu_sum": 0,
+                    "weighted_score_sum": 0,
                     "languages": [],
                 }
 
             country_data[iso3_code]["total_speakers"] += speakers
-            country_data[iso3_code]["weighted_bleu_sum"] += speakers * lang["bleu"]
+            country_data[iso3_code]["weighted_score_sum"] += (
+                speakers * lang[metric['field_name']]
+            )
             country_data[iso3_code]["languages"].append(
                 {
                     "name": lang["language_name"],
                     "speakers": speakers,
-                    "bleu": lang["bleu"],
+                    "score": lang[metric['field_name']],
                 }
             )
         except (KeyError, AttributeError):
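Each country's map color is the speaker-weighted mean of its languages' scores, accumulated incrementally above. The same computation in one place, with toy numbers:

```python
# Speaker-weighted average as accumulated in country_data (toy numbers).
langs = [
    {"name": "Hindi", "speakers": 528_000_000, "score": 0.66},
    {"name": "English", "speakers": 238_000_000, "score": 0.75},
]
total_speakers = sum(l["speakers"] for l in langs)
weighted_score_sum = sum(l["speakers"] * l["score"] for l in langs)
weighted_avg = weighted_score_sum / total_speakers
print(round(weighted_avg, 3))  # 0.688, pulled toward the larger speaker base
```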
@@ -312,33 +438,11 @@ def create_world_map(results):
 
     # Calculate final weighted averages and prepare hover text
     countries = []
-    bleu_scores = []
+    scores = []
     hover_texts = []
 
-    def make_black_bar(value, max_width=10):
-        filled = int(value * max_width)
-        return "⬛️" * filled + "⬜️" * (max_width - filled)
-
-    def make_colored_bar(value, max_width=10):
-        """Create a colored bar using Unicode blocks
-        🟦 for high values (>0.35)
-        🟨 for medium values (0.25-0.35)
-        🟥 for low values (<0.25)
-        ⬜ for empty space
-        """
-        filled = int(value * max_width)
-        filled = max(0, min(filled, max_width))
-        empty = max_width - filled
-
-        if value > 0.35:
-            return "🟦" * filled + "⬜" * empty
-        elif value > 0.25:
-            return "🟨" * filled + "⬜" * empty
-        else:
-            return "🟥" * filled + "⬜" * empty
-
     for country_code, data in country_data.items():
-        weighted_avg = data["weighted_bleu_sum"] / data["total_speakers"]
+        weighted_avg = data["weighted_score_sum"] / data["total_speakers"]
 
         try:
             country_name = pycountry.countries.get(alpha_3=country_code).name
@@ -357,38 +461,39 @@ def create_world_map(results):
         for lang in main_langs:
             percentage = (lang["speakers"] / data["population"]) * 100
             speaker_bar = make_black_bar(percentage / 100)
-            bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
+
+            # Use the integrated make_colored_bar function directly
+            score_bar = make_colored_bar(lang["score"])
 
             lang_rows.append(
                 f"<b>{lang['name']}</b><br>"
                 f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
-                f"{bleu_bar} {lang['bleu']:.3f} BLEU<br>"
+                f"{score_bar} {lang['score']:.3f} {metric['label']}<br>"
             )
 
         # Add summary for other languages if any
         if other_langs:
             other_speakers = sum(lang["speakers"] for lang in other_langs)
             other_percentage = (other_speakers / data["population"]) * 100
-            other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
+            other_avg_score = sum(lang["score"] for lang in other_langs) / len(
                 other_langs
             )
 
             speaker_bar = make_black_bar(other_percentage / 100)
-            bleu_bar = make_colored_bar((other_avg_bleu - 0.2) / 0.2)
+
+            # Use the integrated make_colored_bar function directly
+            score_bar = make_colored_bar(other_avg_score)
 
             lang_rows.append(
                 f"<b>+{len(other_langs)} other languages</b><br>"
                 f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
-                f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
+                f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>"
             )
 
-        hover_text = (
-            f"<b>{country_name}</b><br><br>"
-            f"{'<br>'.join(lang_rows)}"
-        )
+        hover_text = f"<b>{country_name}</b><br><br>" f"{'<br>'.join(lang_rows)}"
 
         countries.append(country_code)
-        bleu_scores.append(weighted_avg)
+        scores.append(weighted_avg)
         hover_texts.append(hover_text)
 
     # Create the choropleth map
@@ -396,12 +501,12 @@ def create_world_map(results):
         data=go.Choropleth(
             locations=countries,
             locationmode="ISO-3",
-            z=bleu_scores,
+            z=scores,
             text=hover_texts,
             hoverinfo="text",
             colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
             colorbar=dict(
-                title="BLEU Score",
+                title=metric['label'],
                 orientation="h",  # horizontal orientation
                 y=-0.2,  # position below map
                 yanchor="bottom",
@@ -410,13 +515,11 @@ def create_world_map(results):
                 xanchor="center",
                 thickness=20,  # make it a bit thicker when horizontal
             ),
-            zmin=0.1,
-            zmax=0.5,
         )
     )
 
     fig.update_layout(
-        title=dict(text="BLEU Score by Country", x=0.5, xanchor="center"),
+        title=dict(text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"),
         geo=dict(
             showframe=True,
             showcoastlines=True,
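A stripped-down version of the choropleth setup used here, with toy values, in case the structure is easier to read without the surrounding loop:

```python
import plotly.graph_objects as go

# Three-country toy map mirroring the go.Choropleth configuration above.
fig = go.Figure(
    go.Choropleth(
        locations=["DEU", "IND", "BRA"],  # ISO-3 codes, as produced by pycountry
        locationmode="ISO-3",
        z=[0.70, 0.66, 0.69],             # toy weighted averages
        text=["<b>Germany</b>", "<b>India</b>", "<b>Brazil</b>"],
        hoverinfo="text",                 # show only the custom HTML hover text
        colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
    )
)
fig.update_layout(geo=dict(showframe=True, showcoastlines=True))
```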
@@ -437,22 +540,48 @@
 
     return fig
 
+def create_metric_explanation(metric):
+    return gr.Markdown(metric['explanation'])
+
 
 # Create the visualization components
-with gr.Blocks(title="AI Language Translation Benchmark") as demo:
-    gr.Markdown("# AI Language Translation Benchmark")
+with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
+    gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown(
-        "Comparing translation performance across different AI models and languages"
+        "Comparing language proficiency across different models and languages."
+    )
+    start_metric = METRICS["overall_performance"]
+
+    metric = gr.Dropdown(
+        choices=[
+            metric_info["display_name"]
+            for metric_info in METRICS.values()
+        ],
+        value=start_metric["display_name"],
+        label="Select Metric",
+        interactive=True,
     )
+    metric_explanation = create_metric_explanation(start_metric)
 
-    bar_plot = create_model_comparison_plot(results)
-    world_map = create_world_map(results)
+    gr.Markdown("## Model Comparison")
+    create_leaderboard_df(start_metric)
+    model_comparison_plot = gr.Plot(
+        value=create_model_comparison_plot(start_metric),
+        label="Model Comparison",
+    )
 
-    create_leaderboard_df(results)
-    gr.Plot(value=bar_plot, label="Model Comparison")
-    create_language_stats_df(results)
-    create_scatter_plot(results)
-    gr.Plot(value=world_map, container=False, elem_classes="fullwidth-plot")
+    gr.Markdown("## Language Stats")
+    create_language_stats_df(start_metric)
+    scatter_plot = gr.Plot(
+        value=create_scatter_plot(start_metric),
+        label="Speaker Population vs. Metric",
+    )
+    world_map = gr.Plot(
+        value=create_world_map(start_metric),
+        label="World Map",
+        container=False,
+        elem_classes="fullwidth-plot",
+    )
 
     gr.Markdown(
         """
@@ -475,5 +604,23 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         """,
         container=True,
     )
+
+    def update_component(fn, metric_choice):
+        metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
+        return fn(metric)
+
+    from functools import partial
+
+    # Connect the dropdown to update all plots
+    metric.change(fn=partial(update_component, create_metric_explanation), inputs=metric, outputs=metric_explanation)
+    metric.change(
+        fn=partial(update_component, create_model_comparison_plot), inputs=metric, outputs=model_comparison_plot
+    )
+    metric.change(
+        fn=partial(update_component, create_scatter_plot), inputs=metric, outputs=scatter_plot
+    )
+    metric.change(
+        fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
+    )
 
 demo.launch()
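The wiring at the end of app.py (one `update_component` dispatcher plus `functools.partial` per output component) keeps each `metric.change` handler down to a single input. A stand-alone sketch of the same pattern, assuming a recent Gradio; the names here are illustrative, not from the repo:

```python
# Minimal dropdown-to-component wiring with functools.partial (illustrative app).
from functools import partial
import gradio as gr

METRICS = {
    "a": {"display_name": "Metric A", "explanation": "Explains A."},
    "b": {"display_name": "Metric B", "explanation": "Explains B."},
}

def update_component(fn, metric_choice):
    # map the selected display name back to the registry entry
    metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
    return fn(metric)

def render_explanation(metric):
    return gr.Markdown(metric["explanation"])

with gr.Blocks() as demo:
    dropdown = gr.Dropdown(
        choices=[m["display_name"] for m in METRICS.values()],
        value="Metric A",
        interactive=True,
    )
    explanation = render_explanation(METRICS["a"])
    dropdown.change(
        fn=partial(update_component, render_explanation),
        inputs=dropdown,
        outputs=explanation,
    )
```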
evals.py CHANGED
@@ -209,8 +209,8 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     return {
         "model": model,
         "bcp_47": original_language["bcp_47"],
-        "bleu": bleu_score["bleu"],
-        "chrf": chrf_score["score"],
+        "mt_bleu": bleu_score["bleu"],
+        "mt_chrf": chrf_score["score"],
         "sentence_nr": sentence_nr,
     }
 
@@ -316,7 +316,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
     return {
         "model": model,
         "bcp_47": language["bcp_47"],
-        "chrf": chrf_score["score"],
+        "mlm_chrf": chrf_score["score"],
         "sentence_nr": nr,
     }
 
@@ -352,7 +352,7 @@ async def main():
     classification_scores = await tqdm_asyncio.gather(
         *classification_scores, miniters=1
     )
-    print("evaluate mlm")
+    print("evaluate masked language modeling")
     mlm_scores = [
         mlm_and_evaluate(model, language.bcp_47, i)
         for i in range(n_sentences)
@@ -362,9 +362,9 @@ async def main():
         and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
     ]
     mlm_scores = await tqdm_asyncio.gather(*mlm_scores, miniters=1)
-    results = []
+    all_results = []
     for language in languages.itertuples():
-        results_for_language = []
+        results = []
         for model in models:
             translations_for_model = [
                 score
@@ -381,36 +381,38 @@ async def main():
                 for score in mlm_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
-            bleu = mean([s["bleu"] for s in translations_for_model])
-            chrf = mean([s["chrf"] for s in translations_for_model])
-            accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
-            mlm = mean([s["chrf"] for s in mlm_for_model]) / 100
-            overall_score = (bleu + accuracy + mlm) / 3
+            mt_bleu = mean([s["mt_bleu"] for s in translations_for_model])
+            mt_chrf = mean([s["mt_chrf"] for s in translations_for_model])
+            cls_acc = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            mlm_chrf = mean([s["mlm_chrf"] for s in mlm_for_model])
+            overall_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
             if translations_for_model:
-                results_for_language.append(
+                results.append(
                     {
                         "model": model,
-                        "bleu": bleu,
-                        "chrf": chrf,
-                        "accuracy": accuracy,
-                        "mlm": mlm,
+                        "mt_bleu": mt_bleu,
+                        "mt_chrf": mt_chrf,
+                        "cls_acc": cls_acc,
+                        "mlm_chrf": mlm_chrf,
                         "overall_score": overall_score,
                     }
                 )
-        if results_for_language:
-            results.append(
+        if results:
+            all_results.append(
                 {
                     "language_name": language.language_name,
                     "bcp_47": language.bcp_47,
                     "speakers": language.speakers,
-                    "scores": results_for_language,
-                    "bleu": mean([s["bleu"] for s in results_for_language]),
-                    "chrf": mean([s["chrf"] for s in results_for_language]),
-                    "accuracy": mean([s["accuracy"] for s in results_for_language]),
-                    "mlm": mean([s["mlm"] for s in results_for_language]),
-                    "overall_score": mean(
-                        [s["overall_score"] for s in results_for_language]
+                    "scores": results,
+                    "mt_bleu": mean([s["mt_bleu"] for s in results]),
+                    "mt_chrf": mean([s["mt_chrf"] for s in results]),
+                    "cls_acc": mean(
+                        [s["cls_acc"] for s in results]
                     ),
+                    "mlm_chrf": mean(
+                        [s["mlm_chrf"] for s in results]
+                    ),
+                    "overall_score": mean([s["overall_score"] for s in results]),
                     "commonvoice_hours": language.commonvoice_hours
                     if not pd.isna(language.commonvoice_hours)
                     else None,
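The rescaling in the new `overall_score` is the point of this hunk: ChrF values live on a 0-100 scale while accuracy lives on 0-1, so both ChrF terms are divided by 100 before averaging (and the translation term is now ChrF-based rather than BLEU-based). Checking the formula against the llama-3.3 English row stored in results.json:

```python
# Worked check of the new overall_score against the stored English/llama row.
mt_chrf = 63.24229348441665   # 0-100 scale
cls_acc = 0.6                 # 0-1 scale
mlm_chrf = 93.62602669879945  # 0-100 scale

overall_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
print(overall_score)  # 0.7228944006107203, matching results.json
```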
@@ -421,7 +423,7 @@ async def main():
             }
         )
     with open("results.json", "w") as f:
-        json.dump(results, f, indent=2, ensure_ascii=False)
+        json.dump(all_results, f, indent=2, ensure_ascii=False)
 
 
 if __name__ == "__main__":
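The dict keys read in `translate_and_evaluate` and `mlm_and_evaluate` above (`bleu_score["bleu"]`, `chrf_score["score"]`) match what the Hugging Face `evaluate` metrics return; whether evals.py actually uses that library is not visible in this diff, so treat the following as an assumption:

```python
# Assumed origin of the score dicts: Hugging Face `evaluate` (not shown in this diff).
import evaluate

bleu = evaluate.load("bleu")
chrf = evaluate.load("chrf")

predictions = ["the cat sat on the mat"]
references = [["the cat sat on the mat"]]

bleu_score = bleu.compute(predictions=predictions, references=references)
chrf_score = chrf.compute(predictions=predictions, references=references)
print(bleu_score["bleu"], chrf_score["score"])  # BLEU in 0-1, ChrF in 0-100
```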
results.json CHANGED
@@ -6,50 +6,50 @@
     "scores": [
       {
         "model": "openai/gpt-4o-mini",
-        "bleu": 0.89404322120213,
-        "chrf": 92.53933977489264,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9778605197038973,
-        "overall_score": 0.8128568025242314
+        "mt_bleu": 0.5245466124037277,
+        "mt_chrf": 65.25187717981981,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 97.84704595784264,
+        "overall_score": 0.7325519660144305
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.4351349353198866,
-        "chrf": 54.9504915580248,
-        "accuracy": 0.6,
-        "mlm": 0.9681484728467826,
-        "overall_score": 0.6677611360555563
+        "mt_bleu": 0.48750797044187216,
+        "mt_chrf": 63.24229348441665,
+        "cls_acc": 0.6,
+        "mlm_chrf": 93.62602669879945,
+        "overall_score": 0.7228944006107203
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
-        "bleu": 0.8800468872938262,
-        "chrf": 94.30164664106223,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.804094099273989,
-        "overall_score": 0.7391581066337162
+        "mt_bleu": 0.486501959595472,
+        "mt_chrf": 63.8187259254881,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 79.91140615317198,
+        "overall_score": 0.656878218039978
       },
       {
         "model": "google/gemini-2.0-flash-001",
-        "bleu": 0.8489646963773831,
-        "chrf": 92.73129066280984,
-        "accuracy": 0.8666666666666667,
-        "mlm": 0.9770616407001859,
-        "overall_score": 0.8975643345814119
+        "mt_bleu": 0.6060954569411976,
+        "mt_chrf": 71.2288943066563,
+        "cls_acc": 0.8666666666666667,
+        "mlm_chrf": 98.79868693366329,
+        "overall_score": 0.8556474930232877
       },
       {
         "model": "microsoft/phi-4",
-        "bleu": 0.8230104823079876,
-        "chrf": 91.69043412576788,
-        "accuracy": 0.7,
-        "mlm": 0.9632049588292643,
-        "overall_score": 0.8287384803790839
+        "mt_bleu": 0.5199836121545649,
+        "mt_chrf": 66.05410510011644,
+        "cls_acc": 0.7,
+        "mlm_chrf": 97.2290729316734,
+        "overall_score": 0.7776105934392995
       }
     ],
-    "bleu": 0.7762400445002428,
-    "chrf": 85.24264055251147,
-    "accuracy": 0.6533333333333333,
-    "mlm": 0.9380739382708239,
-    "overall_score": 0.7892157720348,
+    "mt_bleu": 0.5249271223073668,
+    "mt_chrf": 65.91917919929946,
+    "cls_acc": 0.6533333333333333,
+    "mlm_chrf": 93.48244773503015,
+    "overall_score": 0.7491165342255433,
     "commonvoice_hours": 2651.0,
     "commonvoice_locale": "en",
     "population": {
@@ -217,18 +217,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3977775857451761,
-        "chrf": 57.672913792439125,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.926731451729437,
-        "overall_score": 0.6303919013804266
+        "mt_bleu": 0.38557580495281013,
+        "mt_chrf": 61.11151378837755,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 94.55849047452216,
+        "overall_score": 0.7077889030985546
       }
     ],
-    "bleu": 0.3977775857451761,
-    "chrf": 57.672913792439125,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.926731451729437,
-    "overall_score": 0.6303919013804266,
+    "mt_bleu": 0.38557580495281013,
+    "mt_chrf": 61.11151378837755,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 94.55849047452216,
+    "overall_score": 0.7077889030985546,
     "commonvoice_hours": 422.0,
     "commonvoice_locale": "zh-TW",
     "population": {
@@ -261,18 +261,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.333521621016373,
-        "chrf": 50.48364584189306,
-        "accuracy": 0.5,
-        "mlm": 0.9585976421208252,
-        "overall_score": 0.5973730877123994
+        "mt_bleu": 0.32404902340686065,
+        "mt_chrf": 53.54085104449268,
+        "cls_acc": 0.5,
+        "mlm_chrf": 96.17240172798218,
+        "overall_score": 0.6657108425749162
       }
     ],
-    "bleu": 0.333521621016373,
-    "chrf": 50.48364584189306,
-    "accuracy": 0.5,
-    "mlm": 0.9585976421208252,
-    "overall_score": 0.5973730877123994,
+    "mt_bleu": 0.32404902340686065,
+    "mt_chrf": 53.54085104449268,
+    "cls_acc": 0.5,
+    "mlm_chrf": 96.17240172798218,
+    "overall_score": 0.6657108425749162,
     "commonvoice_hours": 16.0,
     "commonvoice_locale": "hi-IN",
     "population": {
@@ -291,18 +291,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.29160032861883095,
-        "chrf": 47.668399832701844,
-        "accuracy": 0.5,
-        "mlm": 0.9272973828072317,
-        "overall_score": 0.5729659038086875
+        "mt_bleu": 0.31587937116142056,
+        "mt_chrf": 52.142851262301726,
+        "cls_acc": 0.5,
+        "mlm_chrf": 96.92768852306384,
+        "overall_score": 0.6635684659512185
       }
     ],
-    "bleu": 0.29160032861883095,
-    "chrf": 47.668399832701844,
-    "accuracy": 0.5,
-    "mlm": 0.9272973828072317,
-    "overall_score": 0.5729659038086875,
+    "mt_bleu": 0.31587937116142056,
+    "mt_chrf": 52.142851262301726,
+    "cls_acc": 0.5,
+    "mlm_chrf": 96.92768852306384,
+    "overall_score": 0.6635684659512185,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es",
     "population": {
@@ -354,18 +354,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.277257629790728,
-        "chrf": 46.62779335380641,
-        "accuracy": 0.4666666666666667,
-        "mlm": 0.9617481078420298,
-        "overall_score": 0.5685574680998081
+        "mt_bleu": 0.39547934933771334,
+        "mt_chrf": 57.51652731936118,
+        "cls_acc": 0.4666666666666667,
+        "mlm_chrf": 94.97026443937914,
+        "overall_score": 0.6638448614180232
       }
     ],
-    "bleu": 0.277257629790728,
-    "chrf": 46.62779335380641,
-    "accuracy": 0.4666666666666667,
-    "mlm": 0.9617481078420298,
-    "overall_score": 0.5685574680998081,
+    "mt_bleu": 0.39547934933771334,
+    "mt_chrf": 57.51652731936118,
+    "cls_acc": 0.4666666666666667,
+    "mlm_chrf": 94.97026443937914,
+    "overall_score": 0.6638448614180232,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar",
     "population": {
@@ -416,18 +416,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2659144372728079,
-        "chrf": 44.14831240898717,
-        "accuracy": 0.43333333333333335,
-        "mlm": 0.9414677321132675,
-        "overall_score": 0.5469051675731363
+        "mt_bleu": 0.26585004461425726,
+        "mt_chrf": 47.37157150967947,
+        "cls_acc": 0.43333333333333335,
+        "mlm_chrf": 94.38802161979918,
+        "overall_score": 0.6169764215427066
       }
     ],
-    "bleu": 0.2659144372728079,
-    "chrf": 44.14831240898717,
-    "accuracy": 0.43333333333333335,
-    "mlm": 0.9414677321132675,
-    "overall_score": 0.5469051675731363,
+    "mt_bleu": 0.26585004461425726,
+    "mt_chrf": 47.37157150967947,
+    "cls_acc": 0.43333333333333335,
+    "mlm_chrf": 94.38802161979918,
+    "overall_score": 0.6169764215427066,
     "commonvoice_hours": 77.0,
     "commonvoice_locale": "ur",
     "population": {
@@ -445,18 +445,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.315663773358301,
-        "chrf": 49.253978669350964,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.960796739893282,
-        "overall_score": 0.6143757266394165
+        "mt_bleu": 0.3510210872150948,
+        "mt_chrf": 55.795595938804894,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 97.12318847922649,
+        "overall_score": 0.6986181702823268
       }
     ],
-    "bleu": 0.315663773358301,
-    "chrf": 49.253978669350964,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.960796739893282,
-    "overall_score": 0.6143757266394165,
+    "mt_bleu": 0.3510210872150948,
+    "mt_chrf": 55.795595938804894,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 97.12318847922649,
+    "overall_score": 0.6986181702823268,
     "commonvoice_hours": 1052.0,
     "commonvoice_locale": "fr",
     "population": {
@@ -531,18 +531,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.21265887286151353,
-        "chrf": 41.501657722373686,
-        "accuracy": 0.4,
-        "mlm": 0.8995272489886615,
-        "overall_score": 0.504062040616725
+        "mt_bleu": 0.2874920154082786,
+        "mt_chrf": 51.49279116112809,
+        "cls_acc": 0.4,
+        "mlm_chrf": 90.6067262108039,
+        "overall_score": 0.6069983912397733
       }
     ],
-    "bleu": 0.21265887286151353,
-    "chrf": 41.501657722373686,
-    "accuracy": 0.4,
-    "mlm": 0.8995272489886615,
-    "overall_score": 0.504062040616725,
+    "mt_bleu": 0.2874920154082786,
+    "mt_chrf": 51.49279116112809,
+    "cls_acc": 0.4,
+    "mlm_chrf": 90.6067262108039,
+    "overall_score": 0.6069983912397733,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn",
     "population": {
@@ -560,18 +560,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.27514792195783394,
-        "chrf": 45.901248962808694,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9640739007405215,
-        "overall_score": 0.6019628297883407
+        "mt_bleu": 0.33491649454450034,
+        "mt_chrf": 54.60211868234021,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.52676764996336,
+        "overall_score": 0.6926518433299008
       }
     ],
-    "bleu": 0.27514792195783394,
-    "chrf": 45.901248962808694,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9640739007405215,
-    "overall_score": 0.6019628297883407,
+    "mt_bleu": 0.33491649454450034,
+    "mt_chrf": 54.60211868234021,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.52676764996336,
+    "overall_score": 0.6926518433299008,
     "commonvoice_hours": 177.0,
     "commonvoice_locale": "pt",
     "population": {
@@ -600,18 +600,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3048037308116852,
-        "chrf": 48.4304965568793,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.9033444436966103,
-        "overall_score": 0.5804938359472096
+        "mt_bleu": 0.3078917767345886,
+        "mt_chrf": 50.505686987696365,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 90.10119297923285,
+        "overall_score": 0.6464673776675418
      }
    ],
-    "bleu": 0.3048037308116852,
-    "chrf": 48.4304965568793,
-    "accuracy": 0.5333333333333333,
-    "mlm": 0.9033444436966103,
-    "overall_score": 0.5804938359472096,
+    "mt_bleu": 0.3078917767345886,
+    "mt_chrf": 50.505686987696365,
+    "cls_acc": 0.5333333333333333,
+    "mlm_chrf": 90.10119297923285,
+    "overall_score": 0.6464673776675418,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN",
     "population": {
@@ -630,18 +630,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.26108507692625094,
-        "chrf": 45.063308940468154,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9563400339874765,
-        "overall_score": 0.5946972591934646
+        "mt_bleu": 0.32647288591882895,
+        "mt_chrf": 53.107657805277526,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.21400287169976,
+        "overall_score": 0.6866277578121466
       }
     ],
-    "bleu": 0.26108507692625094,
-    "chrf": 45.063308940468154,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9563400339874765,
-    "overall_score": 0.5946972591934646,
+    "mt_bleu": 0.32647288591882895,
+    "mt_chrf": 53.107657805277526,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.21400287169976,
+    "overall_score": 0.6866277578121466,
     "commonvoice_hours": 242.0,
     "commonvoice_locale": "ru",
     "population": {
@@ -677,18 +677,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2709203338132304,
-        "chrf": 44.36399636969686,
-        "accuracy": 0.5,
-        "mlm": 0.9612351448314987,
-        "overall_score": 0.5773851595482431
+        "mt_bleu": 0.29267168415814176,
+        "mt_chrf": 49.16720485265401,
+        "cls_acc": 0.5,
+        "mlm_chrf": 94.82776161604177,
+        "overall_score": 0.646649888228986
       }
     ],
-    "bleu": 0.2709203338132304,
-    "chrf": 44.36399636969686,
-    "accuracy": 0.5,
-    "mlm": 0.9612351448314987,
-    "overall_score": 0.5773851595482431,
+    "mt_bleu": 0.29267168415814176,
+    "mt_chrf": 49.16720485265401,
+    "cls_acc": 0.5,
+    "mlm_chrf": 94.82776161604177,
+    "overall_score": 0.646649888228986,
     "commonvoice_hours": 411.0,
     "commonvoice_locale": "sw",
     "population": {
@@ -710,18 +710,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.27441353638286026,
-        "chrf": 46.025445629112156,
-        "accuracy": 0.6,
-        "mlm": 0.9465444909745621,
-        "overall_score": 0.6069860091191407
+        "mt_bleu": 0.30782604302717903,
+        "mt_chrf": 52.62467814017025,
+        "cls_acc": 0.6,
+        "mlm_chrf": 95.83373661382923,
+        "overall_score": 0.6948613825133316
       }
     ],
-    "bleu": 0.27441353638286026,
-    "chrf": 46.025445629112156,
-    "accuracy": 0.6,
-    "mlm": 0.9465444909745621,
-    "overall_score": 0.6069860091191407,
+    "mt_bleu": 0.30782604302717903,
+    "mt_chrf": 52.62467814017025,
+    "cls_acc": 0.6,
+    "mlm_chrf": 95.83373661382923,
+    "overall_score": 0.6948613825133316,
     "commonvoice_hours": 33.0,
     "commonvoice_locale": "id",
     "population": {
@@ -736,18 +736,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.3338682761061998,
-        "chrf": 50.216731068308064,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9526738506105953,
-        "overall_score": 0.6177362644611538
+        "mt_bleu": 0.3880450110946665,
+        "mt_chrf": 57.659717194572515,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 96.78268690494019,
+        "overall_score": 0.7036969025539311
       }
     ],
-    "bleu": 0.3338682761061998,
-    "chrf": 50.216731068308064,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9526738506105953,
-    "overall_score": 0.6177362644611538,
+    "mt_bleu": 0.3880450110946665,
+    "mt_chrf": 57.659717194572515,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 96.78268690494019,
+    "overall_score": 0.7036969025539311,
     "commonvoice_hours": 1358.0,
     "commonvoice_locale": "de",
     "population": {
@@ -787,18 +787,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2940100667664714,
-        "chrf": 46.403097021492236,
-        "accuracy": 0.6,
-        "mlm": 0.9337910001211718,
-        "overall_score": 0.609267022295881
+        "mt_bleu": 0.2478415746367755,
+        "mt_chrf": 47.19744231900874,
+        "cls_acc": 0.6,
+        "mlm_chrf": 92.47052714876749,
+        "overall_score": 0.6655598982259208
       }
     ],
-    "bleu": 0.2940100667664714,
-    "chrf": 46.403097021492236,
-    "accuracy": 0.6,
-    "mlm": 0.9337910001211718,
-    "overall_score": 0.609267022295881,
+    "mt_bleu": 0.2478415746367755,
+    "mt_chrf": 47.19744231900874,
+    "cls_acc": 0.6,
+    "mlm_chrf": 92.47052714876749,
+    "overall_score": 0.6655598982259208,
     "commonvoice_hours": 222.0,
     "commonvoice_locale": "ja",
     "population": {
@@ -814,18 +814,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2750887189010237,
-        "chrf": 46.31463752811596,
-        "accuracy": 0.4,
-        "mlm": 0.9359077032699009,
-        "overall_score": 0.5369988073903081
+        "mt_bleu": 0.3785489421990512,
+        "mt_chrf": 56.9267557487146,
+        "cls_acc": 0.4,
+        "mlm_chrf": 94.3625059002704,
+        "overall_score": 0.6376308721632834
       }
     ],
-    "bleu": 0.2750887189010237,
-    "chrf": 46.31463752811596,
-    "accuracy": 0.4,
-    "mlm": 0.9359077032699009,
-    "overall_score": 0.5369988073903081,
+    "mt_bleu": 0.3785489421990512,
+    "mt_chrf": 56.9267557487146,
+    "cls_acc": 0.4,
+    "mlm_chrf": 94.3625059002704,
+    "overall_score": 0.6376308721632834,
     "commonvoice_hours": 0.3,
     "commonvoice_locale": "te",
     "population": {
@@ -839,18 +839,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.2584800238292114,
-        "chrf": 44.69889855306244,
-        "accuracy": 0.5666666666666667,
-        "mlm": 0.9351731522339883,
-        "overall_score": 0.5867732809099554
+        "mt_bleu": 0.29576799752528954,
+        "mt_chrf": 51.55512571221437,
+        "cls_acc": 0.5666666666666667,
+        "mlm_chrf": 95.03930657100632,
+        "overall_score": 0.6775369964996245
       }
     ],
-    "bleu": 0.2584800238292114,
-    "chrf": 44.69889855306244,
-    "accuracy": 0.5666666666666667,
-    "mlm": 0.9351731522339883,
-    "overall_score": 0.5867732809099554,
+    "mt_bleu": 0.29576799752528954,
+    "mt_chrf": 51.55512571221437,
+    "cls_acc": 0.5666666666666667,
+    "mlm_chrf": 95.03930657100632,
+    "overall_score": 0.6775369964996245,
     "commonvoice_hours": 20.0,
     "commonvoice_locale": "mr",
     "population": {
@@ -864,18 +864,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.23082586428104943,
-        "chrf": 41.42591471734489,
-        "accuracy": 0.4666666666666667,
-        "mlm": 0.9453687616674971,
-        "overall_score": 0.5476204308717377
+        "mt_bleu": 0.26767127029757953,
+        "mt_chrf": 48.9335568346396,
+        "cls_acc": 0.4666666666666667,
+        "mlm_chrf": 91.68807278010077,
+        "overall_score": 0.6242943209380235
       }
     ],
-    "bleu": 0.23082586428104943,
-    "chrf": 41.42591471734489,
-    "accuracy": 0.4666666666666667,
-    "mlm": 0.9453687616674971,
-    "overall_score": 0.5476204308717377,
+    "mt_bleu": 0.26767127029757953,
+    "mt_chrf": 48.9335568346396,
+    "cls_acc": 0.4666666666666667,
+    "mlm_chrf": 91.68807278010077,
+    "overall_score": 0.6242943209380235,
     "commonvoice_hours": 0.0,
     "commonvoice_locale": "jv",
     "population": {
@@ -890,18 +890,18 @@
     "scores": [
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
-        "bleu": 0.252552287345529,
-        "chrf": 43.351007120897606,
-        "accuracy": 0.5333333333333333,
-        "mlm": 0.9638175194388952,
-        "overall_score": 0.5832343800392524
+        "mt_bleu": 0.26736329890789995,
+        "mt_chrf": 49.52763533189073,
+        "cls_acc": 0.5333333333333333,
+        "mlm_chrf": 94.33244905535389,
+        "overall_score": 0.6573113924019266
      }
    ],
-    "bleu": 0.252552287345529,
-    "chrf": 43.351007120897606,
-    "accuracy": 0.5333333333333333,
-    "mlm": 0.9638175194388952,
-    "overall_score": 0.5832343800392524,
+    "mt_bleu": 0.26736329890789995,
+    "mt_chrf": 49.52763533189073,
+    "cls_acc": 0.5333333333333333,
+    "mlm_chrf": 94.33244905535389,
+    "overall_score": 0.6573113924019266,
     "commonvoice_hours": 5.9,
     "commonvoice_locale": "vi",
     "population": {