David Pomerenke committed
Commit 63fd3b1 · Parent: 4f572a5

Improve methodology

Files changed (1)
  1. app.py +68 -43
app.py CHANGED
@@ -1,4 +1,5 @@
 import json
+from functools import partial
 
 import gradio as gr
 import pandas as pd
@@ -68,8 +69,12 @@ def mean(lst):
 
 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
-    langs_with_score = [lang for lang in results if lang[metric['field_name']] is not None]
-    sorted_langs = sorted(langs_with_score, key=lambda x: x[metric['field_name']], reverse=True)
+    langs_with_score = [
+        lang for lang in results if lang[metric["field_name"]] is not None
+    ]
+    sorted_langs = sorted(
+        langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
+    )
     n_langs = len(sorted_langs)
     high_cutoff = n_langs // 4  # top 25%
     low_cutoff = n_langs - n_langs // 4  # bottom 25%
@@ -97,7 +102,7 @@ def create_leaderboard_df(metric):
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
-            model_scores[model][category].append(score[metric['field_name']])
+            model_scores[model][category].append(score[metric["field_name"]])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
@@ -183,14 +188,14 @@ def create_model_comparison_plot(metric):
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
-    y_label = metric['label']
+    y_label = metric["label"]
 
     # Flatten the data for the selected metric
     scores_flat = []
     for lang in top_languages:
         for score in lang["scores"]:
             # Get the value directly using the field name
-            value = score[metric['field_name']]
+            value = score[metric["field_name"]]
             if value is not None:
                 scores_flat.append(
                     {
@@ -292,9 +297,9 @@ def create_scatter_plot(metric):
     for lang in filtered_results:
         # Calculate average score for this metric across all models
         scores = [
-            score[metric['field_name']]
+            score[metric["field_name"]]
            for score in lang["scores"]
-            if score[metric['field_name']] is not None
+            if score[metric["field_name"]] is not None
        ]
        if scores:  # Only include if we have valid scores
            avg_score = sum(scores) / len(scores)
@@ -332,7 +337,7 @@ def create_scatter_plot(metric):
     fig.update_layout(
         title=None,
         xaxis_title="Number of Speakers (Millions)",
-        yaxis_title=metric['label'],
+        yaxis_title=metric["label"],
         height=500,
         showlegend=False,
     )
@@ -368,6 +373,7 @@ def get_population_data():
         data[t_code] = t_population
     return data
 
+
 # Helper functions for visualization
 def make_black_bar(value, max_width=10):
     filled = int(value * max_width)
@@ -396,13 +402,14 @@ def make_colored_bar(score, max_width=10):
     else:
         return "🟥" * filled + "⬜" * empty
 
+
 def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
     for lang in results:
         # Skip languages without the required data
-        if "population" not in lang or lang[metric['field_name']] is None:
+        if "population" not in lang or lang[metric["field_name"]] is None:
             continue
 
         for country_code, speakers in lang["population"].items():
@@ -423,13 +430,13 @@ def create_world_map(metric):
 
             country_data[iso3_code]["total_speakers"] += speakers
             country_data[iso3_code]["weighted_score_sum"] += (
-                speakers * lang[metric['field_name']]
+                speakers * lang[metric["field_name"]]
            )
            country_data[iso3_code]["languages"].append(
                {
                    "name": lang["language_name"],
                    "speakers": speakers,
-                    "score": lang[metric['field_name']],
+                    "score": lang[metric["field_name"]],
                }
            )
        except (KeyError, AttributeError):
@@ -506,7 +513,7 @@ def create_world_map(metric):
            hoverinfo="text",
            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
            colorbar=dict(
-                title=metric['label'],
+                title=metric["label"],
                orientation="h",  # horizontal orientation
                y=-0.2,  # position below map
                yanchor="bottom",
@@ -519,7 +526,9 @@ def create_world_map(metric):
    )
 
    fig.update_layout(
-        title=dict(text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"),
+        title=dict(
+            text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"
+        ),
        geo=dict(
            showframe=True,
            showcoastlines=True,
@@ -540,23 +549,19 @@ def create_world_map(metric):
 
    return fig
 
+
 def create_metric_explanation(metric):
-    return gr.Markdown(metric['explanation'])
+    return gr.Markdown(metric["explanation"])
 
 
 # Create the visualization components
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
    gr.Markdown("# AI Language Proficiency Benchmark")
-    gr.Markdown(
-        "Comparing language proficiency across different models and languages."
-    )
+    gr.Markdown("Comparing language proficiency across different models and languages.")
    start_metric = METRICS["overall_performance"]
 
    metric = gr.Dropdown(
-        choices=[
-            metric_info["display_name"]
-            for metric_info in METRICS.values()
-        ],
+        choices=[metric_info["display_name"] for metric_info in METRICS.values()],
        value=start_metric["display_name"],
        label="Select Metric",
        interactive=True,
@@ -586,38 +591,58 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
    gr.Markdown(
        """
        ## Methodology
-        ### Dataset
-        - Using [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality human-translated benchmark comprising 200 languages
-        - Each language is tested with the same 100 sentences
-        - All translations are from the evaluated language to a fixed set of representative languages sampled by number of speakers
-        - Language statistics sourced from Ethnologue and Wikidata
-
-        ### Models & Evaluation
-        - Models accessed through [OpenRouter](https://openrouter.ai/), including fast models of all big labs, open and closed
-        - **BLEU Score**: Translations are evaluated using the BLEU metric, which measures how similar the AI's translation is to a human reference translation -- higher is better
-
-        ### Language Categories
-        Languages are divided into three tiers based on translation difficulty:
-        - High-Resource: Top 25% of languages by BLEU score (easiest to translate)
-        - Mid-Resource: Middle 50% of languages
-        - Low-Resource: Bottom 25% of languages (hardest to translate)
+
+        ### Benchmark Data
+        We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
+
+        Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
+
+        ### AI Models
+        We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
+
+        ### Evaluation Tasks
+        Our benchmark includes three core tasks to assess different aspects of language understanding:
+
+        1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
+            - [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
+            - [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
+
+        2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
+            - Group sentences by URL into paragraphs with the same topic
+            - Use the 5 most common topics, encoded as numbers rather than English labels
+            - Provide 5 examples of each topic as few-shot examples
+            - Test the model's ability to classify new text
+            - Report accuracy as the primary metric
+
+        3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
+            - Mask approximately 5% of each sentence at a random position
+            - Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
+            - Evaluate predictions using ChrF score against the original text
+
+        The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
        """,
        container=True,
    )
-
+
    def update_component(fn, metric_choice):
        metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
        return fn(metric)
-
-    from functools import partial
 
-    # Connect the dropdown to update all plots
-    metric.change(fn=partial(update_component, create_metric_explanation), inputs=metric, outputs=metric_explanation)
+
+    metric.change(
+        fn=partial(update_component, create_metric_explanation),
+        inputs=metric,
+        outputs=metric_explanation,
+    )
    metric.change(
-        fn=partial(update_component, create_model_comparison_plot), inputs=metric, outputs=model_comparison_plot
+        fn=partial(update_component, create_model_comparison_plot),
+        inputs=metric,
+        outputs=model_comparison_plot,
    )
    metric.change(
-        fn=partial(update_component, create_scatter_plot), inputs=metric, outputs=scatter_plot
+        fn=partial(update_component, create_scatter_plot),
+        inputs=metric,
+        outputs=scatter_plot,
    )
    metric.change(
        fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
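The methodology text added above sources speaker counts and language-code resolution from Unicode CLDR via the langcodes package. A minimal sketch of how such lookups typically work with langcodes (illustrative only; the exact calls in app.py are not part of this diff, and `speaking_population()` additionally needs the language_data package installed):

```python
# Sketch: language-code normalization and CLDR-backed speaker counts via langcodes.
from langcodes import Language, standardize_tag

tag = standardize_tag("pt-BR")         # normalize a BCP-47 style tag
lang = Language.get(tag)
print(lang.display_name())             # e.g. "Portuguese (Brazil)"
print(lang.speaking_population())      # approximate speaker population from CLDR data
```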
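OpenRouter exposes an OpenAI-compatible API, which is what makes a single client usable across all benchmarked models. A hedged sketch of such a call (the model slug and prompt are placeholders; the request code actually used by app.py is not shown in this diff):

```python
# Sketch: querying a model through OpenRouter's OpenAI-compatible endpoint.
# Assumes OPENROUTER_API_KEY is set; the model slug is only an example.
import os
from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)
response = client.chat.completions.create(
    model="meta-llama/llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "Translate to French: Good morning!"}],
)
print(response.choices[0].message.content)
```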
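Both translation metrics are the standard corpus measures implemented, for example, in sacrebleu; whether app.py computes them through sacrebleu or another wrapper is not visible here, so the snippet below is only a sketch of what the scores mean:

```python
# Sketch: scoring a model translation against a reference with BLEU and ChrF.
from sacrebleu.metrics import BLEU, CHRF

hypotheses = ["The cat sits on the mat."]            # model translations
references = [["The cat is sitting on the mat."]]    # one reference stream

bleu = BLEU().corpus_score(hypotheses, references)   # n-gram precision with brevity penalty
chrf = CHRF().corpus_score(hypotheses, references)   # character n-gram F-score
print(round(bleu.score, 1), round(chrf.score, 1))
```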
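The exact prompt wording for the classification task is not part of this diff; the following is a hypothetical illustration of the described setup, with numeric topic labels and a handful of labeled examples followed by the paragraph to classify (accuracy is then just the share of correctly predicted labels):

```python
# Sketch: assembling a few-shot topic-classification prompt.
# The wording and the helper name are hypothetical, not taken from app.py.
def build_classification_prompt(examples, text):
    """examples: list of (paragraph, topic_id) pairs; text: paragraph to classify."""
    lines = ["Classify the text into one of the topics 0-4. Answer with the number only."]
    for paragraph, topic_id in examples:
        lines.append(f"Text: {paragraph}\nTopic: {topic_id}")
    lines.append(f"Text: {text}\nTopic:")
    return "\n\n".join(lines)


few_shot = [("Voters went to the polls on Sunday.", 0), ("The striker scored twice.", 1)]
print(build_classification_prompt(few_shot, "Parliament passed the budget."))
```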
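The masking helper itself is not shown in this commit either, so the sketch below is only one plausible reading of "mask approximately 5% of each sentence at a random position"; the model's completion would then be compared to the original sentence with ChrF:

```python
# Sketch: hiding ~5% of a sentence at a random position (helper name is hypothetical).
import random

def mask_sentence(sentence, fraction=0.05, mask_token="<mask>"):
    span = max(1, int(len(sentence) * fraction))           # characters to hide
    start = random.randrange(0, len(sentence) - span + 1)  # random position
    return sentence[:start] + mask_token + sentence[start + span:]

original = "The quick brown fox jumps over the lazy dog."
print(mask_sentence(original))
```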
621
+
622
+ The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
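How the per-task metrics are folded into the overall performance score is not spelled out in this diff; an unweighted mean over the task scores would be the simplest assumption, along the lines of:

```python
# Sketch: one plausible aggregation into an overall score (assumed, not taken from app.py;
# the real combination may normalize or weight the tasks differently).
task_scores = {"translation_chrf": 0.62, "classification_accuracy": 0.71, "mlm_chrf": 0.58}
overall_performance = sum(task_scores.values()) / len(task_scores)
print(round(overall_performance, 3))
```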
623
  """,
624
  container=True,
625
  )
626
+
627
  def update_component(fn, metric_choice):
628
  metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
629
  return fn(metric)
 
 
630
 
631
+
632
+ metric.change(
633
+ fn=partial(update_component, create_metric_explanation),
634
+ inputs=metric,
635
+ outputs=metric_explanation,
636
+ )
637
  metric.change(
638
+ fn=partial(update_component, create_model_comparison_plot),
639
+ inputs=metric,
640
+ outputs=model_comparison_plot,
641
  )
642
  metric.change(
643
+ fn=partial(update_component, create_scatter_plot),
644
+ inputs=metric,
645
+ outputs=scatter_plot,
646
  )
647
  metric.change(
648
  fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map