David Pomerenke committed
Commit · 63fd3b1 · 1 Parent(s): 4f572a5
Improve methodology
app.py CHANGED
@@ -1,4 +1,5 @@
 import json
+from functools import partial
 
 import gradio as gr
 import pandas as pd
@@ -68,8 +69,12 @@ def mean(lst):
 
 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
+    langs_with_score = [
+        lang for lang in results if lang[metric["field_name"]] is not None
+    ]
+    sorted_langs = sorted(
+        langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
+    )
     n_langs = len(sorted_langs)
     high_cutoff = n_langs // 4  # top 25%
     low_cutoff = n_langs - n_langs // 4  # bottom 25%
@@ -97,7 +102,7 @@ def create_leaderboard_df(metric):
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
+            model_scores[model][category].append(score[metric["field_name"]])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
@@ -183,14 +188,14 @@ def create_model_comparison_plot(metric):
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
+    y_label = metric["label"]
 
     # Flatten the data for the selected metric
    scores_flat = []
     for lang in top_languages:
         for score in lang["scores"]:
             # Get the value directly using the field name
+            value = score[metric["field_name"]]
             if value is not None:
                 scores_flat.append(
                     {
@@ -292,9 +297,9 @@ def create_scatter_plot(metric):
     for lang in filtered_results:
         # Calculate average score for this metric across all models
         scores = [
+            score[metric["field_name"]]
             for score in lang["scores"]
+            if score[metric["field_name"]] is not None
         ]
         if scores:  # Only include if we have valid scores
             avg_score = sum(scores) / len(scores)
@@ -332,7 +337,7 @@ def create_scatter_plot(metric):
     fig.update_layout(
         title=None,
         xaxis_title="Number of Speakers (Millions)",
+        yaxis_title=metric["label"],
         height=500,
         showlegend=False,
     )
@@ -368,6 +373,7 @@ def get_population_data():
             data[t_code] = t_population
     return data
 
+
 # Helper functions for visualization
 def make_black_bar(value, max_width=10):
     filled = int(value * max_width)
@@ -396,13 +402,14 @@ def make_colored_bar(score, max_width=10):
     else:
         return "🟥" * filled + "⬜" * empty
 
+
 def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
     for lang in results:
         # Skip languages without the required data
+        if "population" not in lang or lang[metric["field_name"]] is None:
             continue
 
         for country_code, speakers in lang["population"].items():
@@ -423,13 +430,13 @@ def create_world_map(metric):
 
                 country_data[iso3_code]["total_speakers"] += speakers
                 country_data[iso3_code]["weighted_score_sum"] += (
+                    speakers * lang[metric["field_name"]]
                 )
                 country_data[iso3_code]["languages"].append(
                     {
                         "name": lang["language_name"],
                         "speakers": speakers,
+                        "score": lang[metric["field_name"]],
                     }
                 )
             except (KeyError, AttributeError):
@@ -506,7 +513,7 @@ def create_world_map(metric):
             hoverinfo="text",
             colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
             colorbar=dict(
+                title=metric["label"],
                 orientation="h",  # horizontal orientation
                 y=-0.2,  # position below map
                 yanchor="bottom",
@@ -519,7 +526,9 @@ def create_world_map(metric):
     )
 
     fig.update_layout(
+        title=dict(
+            text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"
+        ),
         geo=dict(
             showframe=True,
             showcoastlines=True,
@@ -540,23 +549,19 @@ def create_world_map(metric):
 
     return fig
 
+
 def create_metric_explanation(metric):
+    return gr.Markdown(metric["explanation"])
 
 
 # Create the visualization components
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
+    gr.Markdown("Comparing language proficiency across different models and languages.")
     start_metric = METRICS["overall_performance"]
 
     metric = gr.Dropdown(
+        choices=[metric_info["display_name"] for metric_info in METRICS.values()],
         value=start_metric["display_name"],
         label="Select Metric",
         interactive=True,
@@ -586,38 +591,58 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown(
         """
 ## Methodology
+
+### Benchmark Data
+We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
+
+Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
+
+### AI Models
+We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
+
+### Evaluation Tasks
+Our benchmark includes three core tasks to assess different aspects of language understanding:
+
+1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
+   - [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
+   - [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
+
+2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
+   - Group sentences by URL into paragraphs with the same topic
+   - Use the 5 most common topics, encoded as numbers rather than English labels
+   - Provide 5 examples of each topic as few-shot examples
+   - Test the model's ability to classify new text
+   - Report accuracy as the primary metric
+
+3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
+   - Mask approximately 5% of each sentence at a random position
+   - Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
+   - Evaluate predictions using ChrF score against the original text
+
+The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
         """,
         container=True,
     )
+
     def update_component(fn, metric_choice):
         metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
         return fn(metric)
 
+
+    metric.change(
+        fn=partial(update_component, create_metric_explanation),
+        inputs=metric,
+        outputs=metric_explanation,
+    )
     metric.change(
+        fn=partial(update_component, create_model_comparison_plot),
+        inputs=metric,
+        outputs=model_comparison_plot,
    )
     metric.change(
+        fn=partial(update_component, create_scatter_plot),
+        inputs=metric,
+        outputs=scatter_plot,
     )
     metric.change(
         fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
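
A note on the speaker-population source mentioned under *Benchmark Data*: the CLDR-derived numbers can be pulled through the langcodes package together with its companion language_data package. A minimal sketch, using arbitrary example language codes rather than the benchmark's actual language list:

```python
# Sketch: CLDR speaker populations via langcodes (pip install langcodes language_data).
from langcodes import Language

for code in ["sw", "ha", "pt-BR"]:  # example codes only
    lang = Language.get(code)
    print(code, lang.display_name(), f"{lang.speaking_population():,} speakers")
```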
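
For the *AI Models* section: OpenRouter exposes an OpenAI-compatible endpoint, so one client can address all evaluated models. A minimal sketch, assuming the `openai` Python client and a placeholder model slug (app.py's actual request code is not part of this diff):

```python
# Sketch: querying a model through OpenRouter's OpenAI-compatible API.
import os

from openai import OpenAI

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)
response = client.chat.completions.create(
    model="meta-llama/llama-3.1-70b-instruct",  # placeholder model slug
    messages=[{"role": "user", "content": "Translate to Swahili: Good morning!"}],
)
print(response.choices[0].message.content)
```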
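
The two *Machine Translation* metrics can be computed per segment with sacrebleu, a common backend for the linked Hugging Face metrics; the hypothesis and reference strings below are made up for illustration:

```python
# Sketch: segment-level BLEU and ChrF with sacrebleu (pip install sacrebleu).
import sacrebleu

hypothesis = "The cat sits on the mat."       # model output (made up)
reference = "The cat is sitting on the mat."  # gold translation (made up)

bleu = sacrebleu.sentence_bleu(hypothesis, [reference])
chrf = sacrebleu.sentence_chrf(hypothesis, [reference])
print(f"BLEU: {bleu.score:.1f}  ChrF: {chrf.score:.1f}")
```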
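
For *Text Classification*, the important details are the numeric topic labels and the five examples per topic shown before the test item. A sketch of how such a few-shot prompt could be assembled; `build_classification_prompt` and its inputs are illustrative, not taken from app.py:

```python
# Sketch: few-shot topic-classification prompt with numeric labels.
def build_classification_prompt(examples, test_text):
    # examples: (paragraph, topic_id) pairs, e.g. 5 per topic for the 5 most common topics
    lines = ["Classify each text into one of the topics 0-4. Answer with the number only."]
    for paragraph, topic_id in examples:
        lines.append(f"Text: {paragraph}\nTopic: {topic_id}")
    lines.append(f"Text: {test_text}\nTopic:")
    return "\n\n".join(lines)

few_shot = [
    ("Example paragraph about politics ...", 0),
    ("Example paragraph about sports ...", 1),
]  # in practice: 5 topics x 5 examples, grouped from FLORES+ sentences by URL
print(build_classification_prompt(few_shot, "A new paragraph to classify ..."))
```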
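
For *Masked Language Modeling*, a sketch of masking roughly 5% of a sentence at a random position and scoring a prediction with ChrF, as described above; `mask_sentence` is an illustrative helper, not app.py's implementation:

```python
# Sketch: mask ~5% of a sentence at a random position, score the prediction with ChrF.
import random

import sacrebleu

def mask_sentence(sentence, fraction=0.05, mask_token="<mask>"):
    span = max(1, int(len(sentence) * fraction))           # ~5% of the characters
    start = random.randrange(0, len(sentence) - span + 1)  # random position
    return sentence[:start] + mask_token + sentence[start + span:]

original = "The quick brown fox jumps over the lazy dog."
masked = mask_sentence(original)
prediction = original  # pretend the model reconstructed the sentence perfectly
print(masked)
print(sacrebleu.sentence_chrf(prediction, [original]).score)
```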
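
The diff does not show how the per-task metrics are combined into the overall performance score; a plain average over per-task scores on a common 0-1 scale is one plausible reading, sketched here purely as an assumption with placeholder values:

```python
# Sketch (assumption, placeholder values): overall score as the mean of per-task scores.
task_scores = {"translation_chrf": 0.52, "classification_accuracy": 0.61, "mlm_chrf": 0.47}
overall_performance = sum(task_scores.values()) / len(task_scores)
print(round(overall_performance, 3))
```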