David Pomerenke committed
Commit e9a19be · Parent: 040dc35

Separate overall scores for T2T / S2T

Files changed (3):
  1. app.py +60 -37
  2. evals.py +5 -4
  3. results.json +0 -0
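
The commit replaces the single `overall_score` with two aggregates, one for text-to-text models and one for speech-to-text models. As a minimal sketch, the two formulas from the evals.py diff below are wrapped here in throwaway helper functions (the example inputs are invented for illustration); the ChrF means are reported on a 0-100 scale, so they are divided by 100 to land on the same 0-1 scale as accuracy:

def t2t_score(mt_chrf: float, cls_acc: float, mlm_chrf: float) -> float:
    # Mean of translation ChrF, classification accuracy, and masked-LM ChrF,
    # with the ChrF values rescaled from 0-100 to 0-1 (as in evals.py).
    return (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3


def s2t_score(asr_wer: float, asr_chrf: float) -> float:
    # Plain average of the two ASR metrics, mirroring evals.py.
    return (asr_wer + asr_chrf) / 2


print(t2t_score(mt_chrf=45.0, cls_acc=0.62, mlm_chrf=70.0))  # ≈ 0.59
print(s2t_score(asr_wer=0.35, asr_chrf=0.55))                # 0.45
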
app.py CHANGED
@@ -8,20 +8,24 @@ import plotly.graph_objects as go
 import pycountry
 
 with open("results.json") as f:
-    results = json.load(f)
+    languages = json.load(f)
+
+languages_with_scores = [
+    lang for lang in languages if lang["t2t_score"] is not None
+]
 
 # Global constants for metric mappings
-METRICS = {
-    "overall_performance": {
-        "display_name": "Overall Performance",
-        "field_name": "overall_score",
-        "label": "Overall Performance Score",
+METRICS = [
+    {
+        "display_name": "Overall Text-to-Text Performance",
+        "field_name": "t2t_score",
+        "label": "Overall Score",
         "explanation": """
-    **Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+    **Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
     Higher scores indicate better overall language capabilities.
     """,
     },
-    "translation_bleu": {
+    {
         "display_name": "Translation (BLEU)",
         "field_name": "mt_bleu",
         "label": "BLEU Score",
@@ -30,7 +34,7 @@ METRICS = {
     It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
     """,
     },
-    "translation_chrf": {
+    {
         "display_name": "Translation (ChrF)",
         "field_name": "mt_chrf",
         "label": "ChrF Score",
@@ -40,7 +44,7 @@ METRICS = {
     Higher scores (0-1) indicate better translations.
     """,
     },
-    "classification_accuracy": {
+    {
         "display_name": "Classification (Accuracy)",
         "field_name": "cls_acc",
         "label": "Classification Accuracy",
@@ -50,7 +54,7 @@ METRICS = {
     Reported as a percentage where higher values indicate better classification performance.
     """,
     },
-    "mlm_chrf": {
+    {
         "display_name": "Masked Language Modeling (ChrF)",
         "field_name": "mlm_chrf",
         "label": "MLM ChrF Score",
@@ -60,7 +64,16 @@ METRICS = {
     between predicted and actual text. Higher scores indicate better language understanding.
     """,
     },
-    "asr_wer": {
+    {
+        "display_name": "Overall Speech-to-Text Performance",
+        "field_name": "s2t_score",
+        "label": "Overall Score",
+        "explanation": """
+    **Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+    Higher scores indicate better overall language capabilities.
+    """,
+    },
+    {
         "display_name": "Automatic Speech Recognition (WER)",
         "field_name": "asr_wer",
         "label": "WER",
@@ -71,7 +84,7 @@ METRICS = {
     Lower scores indicate better performance, with 0 being perfect transcription.
     """,
     },
-    "asr_chrf": {
+    {
         "display_name": "Automatic Speech Recognition ChrF",
         "field_name": "asr_chrf",
         "label": "ChrF",
@@ -80,8 +93,8 @@ METRICS = {
     This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
     Higher scores (0-1) indicate better translations.
     """,
-    },
-}
+    },
+]
 
 
 def mean(lst):
@@ -91,7 +104,7 @@ def mean(lst):
 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
     langs_with_score = [
-        lang for lang in results if lang[metric["field_name"]] is not None
+        lang for lang in languages_with_scores if lang[metric["field_name"]] is not None
     ]
     sorted_langs = sorted(
         langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
@@ -106,7 +119,7 @@ def create_leaderboard_df(metric):
 
     # Get all model scores with categorization
     model_scores = {}
-    for lang in results:
+    for lang in languages_with_scores:
        category = (
            "High-Resource"
            if lang["language_name"] in high_resource
@@ -205,7 +218,7 @@ def create_leaderboard_df(metric):
 
 
 def create_model_comparison_plot(metric):
-    top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]
+    top_languages = sorted(languages_with_scores, key=lambda x: x["speakers"], reverse=True)[:10]
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
@@ -251,14 +264,14 @@ def create_language_stats_df(metric):
     # Create a list to store flattened data
     flat_data = []
 
-    for lang in results:
+    for lang in languages:
         # Find the best model and its BLEU score
         best_model = max(
-            lang["scores"] or [{"overall_score": None, "model": None}],
-            key=lambda x: x["overall_score"],
-        )
+            lang["scores"] or [{"t2t_score": None, "model": None}],
+            key=lambda x: x.get("t2t_score", 0),
+        ) if lang["t2t_score"] is not None else None
 
-        model = best_model["model"]
+        model = best_model["model"] if best_model else None
         model_name = model.split("/")[-1] if model else "N/A"
         model_link = (
             f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
@@ -274,9 +287,9 @@
                 "Language": f"**{lang['language_name']}**",
                 "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
                 # "Models Tested": len(lang["scores"]),
-                "Overall": round(lang["overall_score"], 3)
-                if lang["overall_score"] is not None
-                else "N/A",
+                # "Overall": round(lang["overall_score"], 3)
+                # if lang["overall_score"] is not None
+                # else "N/A",
                 "Translation": round(lang["mt_bleu"], 3)
                 if lang["mt_bleu"] is not None
                 else "N/A",
@@ -286,9 +299,7 @@
                 "MLM": round(lang["mlm_chrf"], 3)
                 if lang["mlm_chrf"] is not None
                 else "N/A",
-                "ASR": round(lang["asr_wer"], 3)
-                if lang["asr_wer"] is not None
-                else "N/A",
+                "ASR": round(lang["asr_wer"], 3) if lang["asr_wer"] is not None else "N/A",
                 "Best Model": model_link,
                 "CommonVoice Hours": commonvoice_link,
             }
@@ -296,9 +307,22 @@
 
     df = pd.DataFrame(flat_data)
     return gr.DataFrame(
-        value=df,
+        value=df,
         label="Language Results",
         show_search="search",
+        pinned_columns=1,
+        column_widths=[
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+        ],
        datatype=[
            "markdown",  # Language
            "number",  # Speakers
@@ -316,7 +340,7 @@
 
 def create_scatter_plot(metric):
     # Filter results to include only languages with sufficient speakers
-    filtered_results = [lang for lang in results if lang["speakers"] >= 10_000]
+    filtered_results = [lang for lang in languages_with_scores if lang["speakers"] >= 10_000]
 
     # Create a list to store data for the scatter plot
     scatter_data = []
@@ -434,7 +458,7 @@
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
-    for lang in results:
+    for lang in languages:
         # Skip languages without the required data
         if "population" not in lang or lang[metric["field_name"]] is None:
             continue
@@ -585,10 +609,10 @@
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown("Comparing language proficiency across different models and languages.")
-    start_metric = METRICS["overall_performance"]
+    start_metric = METRICS[0]
 
     metric = gr.Dropdown(
-        choices=[metric_info["display_name"] for metric_info in METRICS.values()],
+        choices=[metric_info["display_name"] for metric_info in METRICS],
        value=start_metric["display_name"],
        label="Select Metric",
        interactive=True,
@@ -596,7 +620,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     metric_explanation = create_metric_explanation(start_metric)
 
     gr.Markdown("## Model Comparison")
-    create_leaderboard_df(start_metric)
+    # create_leaderboard_df(start_metric)
     model_comparison_plot = gr.Plot(
         value=create_model_comparison_plot(start_metric),
         label="Model Comparison",
@@ -652,10 +676,9 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     )
 
     def update_component(fn, metric_choice):
-        metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
+        metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
         return fn(metric)
 
-
     metric.change(
        fn=partial(update_component, create_metric_explanation),
        inputs=metric,
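
On the app side, METRICS changes from a dict keyed by metric id to a plain list of dicts, so the default metric becomes METRICS[0] and dropdown choices are resolved by display_name. A small self-contained sketch of that lookup pattern (the find_metric helper is illustrative, not part of app.py):

METRICS = [
    {"display_name": "Overall Text-to-Text Performance", "field_name": "t2t_score"},
    {"display_name": "Overall Speech-to-Text Performance", "field_name": "s2t_score"},
]


def find_metric(display_name: str) -> dict:
    # Same pattern as update_component in app.py: first entry whose display_name matches.
    return [m for m in METRICS if m["display_name"] == display_name][0]


start_metric = METRICS[0]  # default dropdown selection, as in the Gradio demo
print(find_metric("Overall Speech-to-Text Performance")["field_name"])  # s2t_score
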
evals.py CHANGED
@@ -522,7 +522,7 @@ async def main():
         mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
         cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
         mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-        overall_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
+        t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
         results.append(
             {
                 "model": model,
@@ -531,7 +531,7 @@ async def main():
                 "mt_chrf": mt_chrf,
                 "cls_acc": cls_acc,
                 "mlm_chrf": mlm_chrf,
-                "overall_score": overall_score,
+                "t2t_score": t2t_score,
             }
         )
         for model in transcription_models:
@@ -550,7 +550,7 @@ async def main():
                 "model_type": "speech-to-text",
                 "asr_wer": asr_wer,
                 "asr_chrf": asr_chrf,
-                "overall_score": (asr_wer + asr_chrf) / 2,
+                "s2t_score": (asr_wer + asr_chrf) / 2,
             }
         )
         language_results = {
@@ -574,7 +574,8 @@ async def main():
             "mlm_chrf",
             "asr_wer",
             "asr_chrf",
-            "overall_score",
+            "t2t_score",
+            "s2t_score",
         ]:
             language_results[score] = mean(
                 [s[score] for s in results if score in s]
results.json CHANGED
The diff for this file is too large to render. See raw diff
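
results.json is regenerated with the new fields. As a hypothetical sketch of one language entry after this change, with field names taken from the app.py/evals.py diffs and every value (including the model identifiers) invented for illustration:

example_language = {
    "language_name": "Swahili",
    "speakers": 71_000_000,
    "scores": [
        # per-model entries now carry t2t_score or s2t_score instead of overall_score
        {"model": "org/some-text-model", "t2t_score": 0.61},
        {"model": "org/some-asr-model", "model_type": "speech-to-text", "s2t_score": 0.48},
    ],
    "mt_bleu": 0.31,
    "mt_chrf": 48.2,
    "cls_acc": 0.64,
    "mlm_chrf": 69.5,
    "asr_wer": 0.37,
    "asr_chrf": 0.58,
    "t2t_score": 0.60,  # language-level mean over the text-to-text model entries
    "s2t_score": 0.48,  # language-level mean over the speech-to-text model entries
}
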