David Pomerenke committed
Commit ba2a2f0 · 1 Parent(s): e32fd78

UI logic for selecting model type and metric

Files changed (3)
  1. app.py +140 -205
  2. evals.py +19 -13
  3. uv.lock +1 -1
app.py CHANGED
@@ -87,7 +87,7 @@ METRICS = {
        """,
    },
    {
-        "display_name": "Automatic Speech Recognition ChrF",
+        "display_name": "Automatic Speech Recognition (ChrF)",
        "field_name": "asr_chrf",
        "label": "ChrF",
        "explanation": """
@@ -104,123 +104,50 @@ def mean(lst):
    return sum(lst) / len(lst)


-def create_leaderboard_df(metric):
-    # Sort languages by average BLEU to determine resource categories
-    langs_with_score = [
-        lang for lang in languages_with_scores if lang[metric["field_name"]] is not None
+def create_leaderboard_df(model_type, metric=None):
+    metric = metric or METRICS[model_type][0]
+    _model_type = {"t2t": "text-to-text", "s2t": "speech-to-text"}[model_type]
+    models = {
+        score["model"]
+        for lang in languages_with_scores
+        for score in lang["scores"]
+        if score["model_type"] == _model_type
+    }
+    model_scores = [
+        {"model": score["model"], metric["field_name"]: score[metric["field_name"]]}
+        for lang in languages_with_scores
+        for score in lang["scores"]
+        for model in models
+        if score["model"] == model
    ]
-    sorted_langs = sorted(
-        langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
+    df = (
+        pd.DataFrame(model_scores)
+        .groupby("model")
+        .agg({metric["field_name"]: ["mean", "count"]})
+        .reset_index()
    )
-    n_langs = len(sorted_langs)
-    high_cutoff = n_langs // 4  # top 25%
-    low_cutoff = n_langs - n_langs // 4  # bottom 25%
-
-    # Create sets of languages for each category
-    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
-    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
-
-    # Get all model scores with categorization
-    model_scores = {}
-    for lang in languages_with_scores:
-        category = (
-            "High-Resource"
-            if lang["language_name"] in high_resource
-            else "Low-Resource"
-            if lang["language_name"] in low_resource
-            else "Mid-Resource"
-        )
-
-        for score in lang["scores"]:
-            model = score["model"]
-            if model not in model_scores:
-                model_scores[model] = {
-                    "High-Resource": [],
-                    "Mid-Resource": [],
-                    "Low-Resource": [],
-                }
-            # Check if the metric field exists in the score dictionary before accessing it
-            if metric["field_name"] in score:
-                model_scores[model][category].append(score[metric["field_name"]])
-            # If the metric is missing, we'll skip this score
-
-    # Calculate average scores and create DataFrame
-    leaderboard_data = []
-    for model, categories in model_scores.items():
-        # Calculate averages for each category
-        high_avg = (
-            round(mean(categories["High-Resource"]), 3)
-            if categories["High-Resource"]
-            else 0
-        )
-        mid_avg = (
-            round(mean(categories["Mid-Resource"]), 3)
-            if categories["Mid-Resource"]
-            else 0
-        )
-        low_avg = (
-            round(mean(categories["Low-Resource"]), 3)
-            if categories["Low-Resource"]
-            else 0
-        )
-
-        # Calculate overall average
-        all_scores = (
-            categories["High-Resource"]
-            + categories["Mid-Resource"]
-            + categories["Low-Resource"]
-        )
-        # Check if all_scores is empty to avoid division by zero
-        overall_avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0
-
-        model_name = model.split("/")[-1]
-        leaderboard_data.append(
-            {
-                "Model": f"[{model_name}](https://openrouter.ai/{model})",
-                "Overall Score": overall_avg,
-                "High-Resource Score": high_avg,
-                "Mid-Resource Score": mid_avg,
-                "Low-Resource Score": low_avg,
-                "Languages Tested": len(all_scores),
-            }
-        )
-
-    # Sort by overall BLEU
-    df = pd.DataFrame(leaderboard_data)
-    df = df.sort_values("Overall Score", ascending=False)
-
-    # Add rank and medals
+    # Flatten the multi-level column names
+    df.columns = df.columns.map(
+        lambda x: f"{x[0]}_{x[1]}" if isinstance(x, tuple) else x
+    )
+    df = df.rename(
+        columns={
+            f"{metric['field_name']}_mean": metric["label"],
+            f"{metric['field_name']}_count": "Languages Tested",
+            "model_": "Model",
+        }
+    )
+    df = df.sort_values(metric["label"], ascending=False)
    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )
-
-    # Reorder columns
-    df = df[
-        [
-            "Rank",
-            "Model",
-            "Overall Score",
-            "High-Resource Score",
-            "Mid-Resource Score",
-            "Low-Resource Score",
-            "Languages Tested",
-        ]
-    ]
-
+    df = df[["Rank", "Model", metric["label"], "Languages Tested"]]
    return gr.DataFrame(
        value=df,
        label="Model Leaderboard",
        show_search=False,
-        datatype=[
-            "number",
-            "markdown",
-            "number",
-            "number",
-            "number",
-            "number",
-            "number",
-        ],
+        datatype=["number", "markdown", "number", "number"],
    )


@@ -292,7 +219,7 @@ def create_language_stats_df(metric):
            else "N/A"
        )
        commonvoice_link = (
-            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
+            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {round(lang['commonvoice_hours'])}h</a>"
            if lang["commonvoice_hours"]
            else "N/A"
        )
@@ -303,18 +230,18 @@ def create_language_stats_df(metric):
            # "Overall": round(lang["overall_score"], 3)
            # if lang["overall_score"] is not None
            # else "N/A",
-            "Translation": round(lang["mt_bleu"], 3)
-            if lang["mt_bleu"] is not None
+            "Best Model": model_link,
+            "MT": round(lang["mt_chrf"], 3)
+            if lang["mt_chrf"] is not None
            else "N/A",
-            "Classification": round(lang["cls_acc"], 3)
+            "CLS": round(lang["cls_acc"], 3)
            if lang["cls_acc"] is not None
            else "N/A",
            "MLM": round(lang["mlm_chrf"], 3)
            if lang["mlm_chrf"] is not None
            else "N/A",
-            "ASR": round(lang["asr_wer"], 3) if lang["asr_wer"] is not None else "N/A",
-            "Best Model": model_link,
-            "CommonVoice Hours": commonvoice_link,
+            "ASR": round(lang["asr_chrf"], 3) if lang["asr_wer"] is not None else "N/A",
+            "Common Voice": commonvoice_link,
        }
        flat_data.append(row)

@@ -327,40 +254,36 @@ def create_language_stats_df(metric):
        column_widths=[
            "100px",
            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
+            # "100px",
+            # "100px",
+            "200px",  # Best Model
+            "100px",  # MT
+            "100px",  # CLS
+            "100px",  # MLM
+            "100px",  # ASR
+            "100px",  # Common Voice
        ],
        datatype=[
            "markdown",  # Language
            "number",  # Speakers
            # "number",  # Models Tested
-            "number",  # Overall
+            # "number",  # Overall
+            "markdown",  # Best Model
            "number",  # Translation
            "number",  # Classification
            "number",  # MLM
            "number",  # ASR
-            "markdown",  # Best Model
            "markdown",  # CommonVoice Hours
        ],
    )


def create_scatter_plot(metric):
-    # Filter results to include only languages with sufficient speakers
-    filtered_results = [
-        lang for lang in languages_with_scores if lang["speakers"] >= 10_000
-    ]
-
    # Create a list to store data for the scatter plot
    scatter_data = []
-
-    for lang in filtered_results:
+    for lang in languages_with_scores:
+        if lang["speakers"] < 10_000:
+            continue
        # Calculate average score for this metric across all models
        scores = [
            score[metric["field_name"]]
@@ -374,32 +297,44 @@ def create_scatter_plot(metric):
                "language": lang["language_name"],
                "speakers": lang["speakers"],
                "score": avg_score,
+                "family": lang["language_family"],
            }
        )

    fig = go.Figure()
-
-    # Convert speakers to millions for display
-    x_vals = [
-        data["speakers"] / 1_000_000 for data in scatter_data
-    ]  # Convert to millions
+    x_vals = [data["speakers"] / 1_000_000 for data in scatter_data]
    y_vals = [data["score"] for data in scatter_data]
+    s_vals = [data["speakers"] / 20_000_000 for data in scatter_data]
+    color_pallette = [
+        "LightSkyBlue",
+        "LightGreen",
+        "LightCoral",
+        "LightPink",
+        "LightGoldenRodYellow",
+        "LightGray",
+        "LightSalmon",
+        "LightSeaGreen",
+    ]
+    color_mapping = {
+        family: color
+        for family, color in zip(
+            sorted(set(data["family"] for data in scatter_data)), color_pallette
+        )
+    }
+    c_vals = [color_mapping[data["family"]] for data in scatter_data]
    labels = [data["language"] for data in scatter_data]
-
-    # Create hover template
    hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
-
    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
+            marker=dict(size=s_vals, color=c_vals),
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate=hover_template,
        )
    )
-
    fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
@@ -407,10 +342,7 @@ def create_scatter_plot(metric):
        height=500,
        showlegend=False,
    )
-
-    # Use log scale for x-axis since speaker numbers vary widely
    fig.update_xaxes(type="log")
-
    return fig


@@ -569,7 +501,6 @@ def create_world_map(metric):
        scores.append(weighted_avg)
        hover_texts.append(hover_text)

-    # Create the choropleth map
    fig = go.Figure(
        data=go.Choropleth(
            locations=countries,
@@ -616,11 +547,21 @@
    return fig


+def create_metric_selector(model_type):
+    match model_type:
+        case "t2t":
+            choices = [m["display_name"] for m in METRICS["t2t"]]
+        case "s2t":
+            choices = [m["display_name"] for m in METRICS["s2t"]]
+    return gr.Dropdown(
+        choices=choices, value=choices[0], label="Select Metric", interactive=True
+    )
+
+
def create_metric_explanation(metric):
    return gr.Markdown(metric["explanation"], container=True)


-
# Create the visualization components
with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
    gr.Markdown("# AI Language Proficiency Benchmark")
@@ -639,12 +580,6 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
    with gr.Row():
        with gr.Column():
            with gr.Accordion("Model Filters", open=False):
-                model_type = gr.Radio(
-                    choices=["Text-to-Text", "Speech-to-Text"],
-                    value="Text-to-Text",
-                    label="Select Model Type",
-                    interactive=True,
-                )
                model_licenses = gr.CheckboxGroup(
                    choices=["open source", "commercial"],
                    value=["open source", "commercial"],
@@ -667,26 +602,6 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
                    label="Select Unit of Analysis",
                    interactive=True,
                )
-                region_filter = gr.CheckboxGroup(
-                    choices=[
-                        "Africa",
-                        "Asia",
-                        "Europe",
-                        "North America",
-                        "South America",
-                        "Oceania",
-                    ],
-                    value=[
-                        "Africa",
-                        "Asia",
-                        "Europe",
-                        "North America",
-                        "South America",
-                        "Oceania",
-                    ],
-                    label="Filter by Region",
-                    interactive=True,
-                )
                family_filter = gr.CheckboxGroup(
                    choices=[
                        "Indo-European",
@@ -717,19 +632,27 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
                    interactive=True,
                )
    with gr.Row():
-        start_metric = METRICS["t2t"][0]
-        metric = gr.Dropdown(
-            choices=[metric["display_name"] for metric in METRICS["t2t"]],
-            value=start_metric["display_name"],
-            label="Main metric to display in figures and map",
-            interactive=True,
-        )
+        with gr.Column():
+            start_model_type = "Text-to-Text"
+            model_type = gr.Radio(
+                choices=["Text-to-Text", "Speech-to-Text"],
+                value=start_model_type,
+                label="Select Model Type",
+                interactive=True,
+            )
+            start_metric = METRICS["t2t"][0]
+            metric = gr.Dropdown(
+                choices=[metric["display_name"] for metric in METRICS["t2t"]],
+                value=start_metric["display_name"],
+                label="Main task and metric to display in figures and map",
+                interactive=True,
+            )

    metric_explanation = create_metric_explanation(start_metric)

    gr.Markdown("## Model Comparison")
-    create_leaderboard_df(start_metric)
-
+    leaderboard_df = create_leaderboard_df("t2t", start_metric)
+
    model_comparison_plot = gr.Plot(
        value=create_model_comparison_plot(start_metric),
        label="Model Comparison",
@@ -748,34 +671,47 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
        elem_classes="fullwidth-plot",
    )

-    def update_component(fn, metric_choice):
-        metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
+    def update_model_type(model_type_choice):
+        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice]
+        return create_metric_selector(model_type), create_leaderboard_df(model_type)
+
+    model_type.change(
+        fn=update_model_type,
+        inputs=model_type,
+        outputs=[metric, leaderboard_df],
+    )
+
+    def update_component(fn, model_type_choice, metric_choice):
+        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice]
+        metric = [m for m in METRICS[model_type] if m["display_name"] == metric_choice][
+            0
+        ]
        return fn(metric)

-    # metric.change(
-    #     fn=partial(update_component, create_metric_explanation),
-    #     inputs=metric,
-    #     outputs=metric_explanation,
-    # )
-    # metric.change(
-    #     fn=partial(update_component, create_model_comparison_plot),
-    #     inputs=metric,
-    #     outputs=model_comparison_plot,
-    # )
-    # metric.change(
-    #     fn=partial(update_component, create_scatter_plot),
-    #     inputs=metric,
-    #     outputs=scatter_plot,
-    # )
-    # metric.change(
-    #     fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
-    # )
+    metric.change(
+        fn=partial(update_component, create_metric_explanation),
+        inputs=[model_type, metric],
+        outputs=metric_explanation,
+    )
+    metric.change(
+        fn=partial(update_component, create_model_comparison_plot),
+        inputs=[model_type, metric],
+        outputs=model_comparison_plot,
+    )
+    metric.change(
+        fn=partial(update_component, create_scatter_plot),
+        inputs=[model_type, metric],
+        outputs=scatter_plot,
+    )
+    metric.change(
+        fn=partial(update_component, create_world_map),
+        inputs=[model_type, metric],
+        outputs=world_map,
+    )

    with gr.Accordion("Methodology", open=False):
        gr.Markdown(
            """
-            ## Methodology
-
            ### Benchmark Data
            We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.

@@ -804,8 +740,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
            - Evaluate predictions using ChrF score against the original text

            The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
-            """,
-            container=True,
+            """
        )

demo.launch()
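
For context on the UI wiring above: the commit updates the metric dropdown by having the model_type radio's change handler return a freshly constructed gr.Dropdown (via create_metric_selector), which Gradio then applies to the output component. Below is a minimal, self-contained sketch of that pattern; the OPTIONS dictionary and update_metric_choices function are illustrative placeholders, not names from the repository, which derives its choices from METRICS["t2t"] and METRICS["s2t"].

import gradio as gr

# Illustrative choice lists only -- the real app builds these from METRICS.
OPTIONS = {
    "Text-to-Text": ["Translation (ChrF)", "Classification (Accuracy)"],
    "Speech-to-Text": ["Automatic Speech Recognition (ChrF)"],
}

with gr.Blocks() as demo:
    model_type = gr.Radio(
        choices=list(OPTIONS), value="Text-to-Text", label="Select Model Type"
    )
    metric = gr.Dropdown(
        choices=OPTIONS["Text-to-Text"],
        value=OPTIONS["Text-to-Text"][0],
        label="Select Metric",
    )

    def update_metric_choices(model_type_choice):
        # Returning a new gr.Dropdown lets Gradio swap in the new choices and value,
        # mirroring what create_metric_selector() does in app.py.
        choices = OPTIONS[model_type_choice]
        return gr.Dropdown(choices=choices, value=choices[0], label="Select Metric")

    model_type.change(fn=update_metric_choices, inputs=model_type, outputs=metric)

demo.launch()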
evals.py CHANGED
@@ -93,11 +93,15 @@ def population(bcp_47):
    }
    return items

-glottolog = pd.read_csv("data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False) # Min _Nan_ Chinese is not N/A!
+
+glottolog = pd.read_csv(
+    "data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
+)  # Min _Nan_ Chinese is not N/A!
glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
    lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
)

+
@cache
def language_family(bcp_47):
    languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
@@ -106,6 +110,7 @@ def language_family(bcp_47):
    family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
    return family["name"]

+
def script_name(iso15924):
    return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]

@@ -255,17 +260,20 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
        max_tokens=1024,
    )
    prediction = reply.choices[0].message.content.strip()
-    bleu_score = bleu.compute(
-        predictions=[prediction],
-        references=[target_sentence],
-        tokenizer=tokenizer.tokenize,
-    )
+    if prediction.strip():
+        bleu_score = bleu.compute(
+            predictions=[prediction],
+            references=[target_sentence],
+            tokenizer=tokenizer.tokenize,
+        )
+    else:
+        bleu_score = {"bleu": 0}
    chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
    return {
        "model": model,
        "bcp_47": original_language["bcp_47"],
        "mt_bleu": bleu_score["bleu"],
-        "mt_chrf": chrf_score["score"],
+        "mt_chrf": chrf_score["score"] / 100,
        "sentence_nr": sentence_nr,
    }

@@ -371,7 +379,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
    return {
        "model": model,
        "bcp_47": language["bcp_47"],
-        "mlm_chrf": chrf_score["score"],
+        "mlm_chrf": chrf_score["score"] / 100,
        "sentence_nr": nr,
    }

@@ -432,7 +440,7 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
        "model": model,
        "bcp_47": language["bcp_47"],
        "asr_wer": wer_score,
-        "asr_chrf": chrf_score["score"],
+        "asr_chrf": chrf_score["score"] / 100,
        "sentence_nr": nr,
    }

@@ -522,7 +530,7 @@ async def main():
        mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
        cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
        mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-        t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
+        t2t_score = (mt_chrf + cls_acc + mlm_chrf) / 3
        results.append(
            {
                "model": model,
@@ -577,9 +585,7 @@ async def main():
            "t2t_score",
            "s2t_score",
        ]:
-            language_results[score] = mean(
-                [s[score] for s in results if score in s]
-            )
+            language_results[score] = mean([s[score] for s in results if score in s])
        all_results.append(language_results)
    with open("results.json", "w") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
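
The evals.py changes also normalize the ChrF-based scores to the 0–1 range at the point where they are recorded (ChrF, as reported by the sacrebleu-backed evaluate metric, is on a 0–100 scale), so the combined t2t_score no longer needs its own / 100 terms. A small worked example with made-up numbers, just to show the two formulas agree:

# Hypothetical per-language averages, purely to illustrate the rescaling:
mt_chrf_raw, cls_acc, mlm_chrf_raw = 54.2, 0.61, 47.9  # ChrF on 0-100, accuracy on 0-1

# Before this commit: raw ChrF stored, normalized only when combining.
old_t2t = (mt_chrf_raw / 100 + cls_acc + mlm_chrf_raw / 100) / 3

# After this commit: ChrF divided by 100 when stored, combined directly.
mt_chrf, mlm_chrf = mt_chrf_raw / 100, mlm_chrf_raw / 100
new_t2t = (mt_chrf + cls_acc + mlm_chrf) / 3

assert round(old_t2t, 6) == round(new_t2t, 6) == 0.543667

The same division is applied to mlm_chrf and asr_chrf, which keeps t2t_score and the per-task columns shown in the app on a common 0–1 scale.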
uv.lock CHANGED
@@ -1000,7 +1000,7 @@ dev = [
[package.metadata]
requires-dist = [
    { name = "gradio", specifier = ">=5.16.2" },
-    { name = "gradio-rangeslider" },
+    { name = "gradio-rangeslider", specifier = ">=0.0.8" },
    { name = "language-data", specifier = ">=1.3.0" },
    { name = "pandas", specifier = ">=2.2.3" },
    { name = "plotly", specifier = ">=6.0.0" },