import json

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry

with open("results.json") as f:
    results = json.load(f)
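# `results` is a list of per-language entries; the lookups below assume each entry
# carries "language_name", "speakers", aggregate fields such as "overall_score",
# "mt_bleu", "mt_chrf", "cls_acc" and "mlm_chrf", CommonVoice hours/locale, a
# per-country "population" mapping, and a "scores" list of per-model results.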


METRICS = {
    "overall_performance": {
        "display_name": "Overall Performance",
        "field_name": "overall_score",
        "label": "Overall Performance Score",
        "explanation": """
**Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
Higher scores indicate better overall language capabilities.
""",
    },
    "translation_bleu": {
        "display_name": "Translation (BLEU)",
        "field_name": "mt_bleu",
        "label": "BLEU Score",
        "explanation": """
**Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
""",
    },
    "translation_chrf": {
        "display_name": "Translation (ChrF)",
        "field_name": "mt_chrf",
        "label": "ChrF Score",
        "explanation": """
**Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
Higher scores (0-1) indicate better translations.
""",
    },
    "classification_accuracy": {
        "display_name": "Classification (Accuracy)",
        "field_name": "cls_acc",
        "label": "Classification Accuracy",
        "explanation": """
**Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
This evaluates a model's understanding of content and context across different languages.
Reported as a percentage where higher values indicate better classification performance.
""",
    },
    "mlm_chrf": {
        "display_name": "Masked Language Modeling (ChrF)",
        "field_name": "mlm_chrf",
        "label": "MLM ChrF Score",
        "explanation": """
**Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
This tests a model's understanding of language structure and semantics by measuring the character-level similarity
between predicted and actual text. Higher scores indicate better language understanding.
""",
    },
}
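# Each METRICS entry is consumed by the helpers below: "field_name" selects the value
# in both the per-language aggregates and the per-model "scores" records,
# "display_name" feeds the dropdown and chart titles, and "label" is used for axis
# and colorbar captions.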


def mean(lst):
    return sum(lst) / len(lst)


def create_leaderboard_df(metric):
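    """Build the model leaderboard table for the selected metric.

    Languages are split into High-/Mid-/Low-Resource tiers by quartile of the
    selected metric; each model then gets per-tier and overall averages.
    """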
    langs_with_score = [lang for lang in results if lang[metric['field_name']] is not None]
    sorted_langs = sorted(langs_with_score, key=lambda x: x[metric['field_name']], reverse=True)
    n_langs = len(sorted_langs)
    high_cutoff = n_langs // 4
    low_cutoff = n_langs - n_langs // 4

    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}

    model_scores = {}
    for lang in results:
        category = (
            "High-Resource"
            if lang["language_name"] in high_resource
            else "Low-Resource"
            if lang["language_name"] in low_resource
            else "Mid-Resource"
        )

        for score in lang["scores"]:
            model = score["model"]
            if model not in model_scores:
                model_scores[model] = {
                    "High-Resource": [],
                    "Mid-Resource": [],
                    "Low-Resource": [],
                }
            # Skip missing values so the tier averages below never see None.
            if score[metric['field_name']] is not None:
                model_scores[model][category].append(score[metric['field_name']])

    leaderboard_data = []
    for model, categories in model_scores.items():
        high_avg = (
            round(mean(categories["High-Resource"]), 3)
            if categories["High-Resource"]
            else 0
        )
        mid_avg = (
            round(mean(categories["Mid-Resource"]), 3)
            if categories["Mid-Resource"]
            else 0
        )
        low_avg = (
            round(mean(categories["Low-Resource"]), 3)
            if categories["Low-Resource"]
            else 0
        )

        all_scores = (
            categories["High-Resource"]
            + categories["Mid-Resource"]
            + categories["Low-Resource"]
        )
        overall_avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0

        model_name = model.split("/")[-1]
        leaderboard_data.append(
            {
                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                "Overall Score": overall_avg,
                "High-Resource Score": high_avg,
                "Mid-Resource Score": mid_avg,
                "Low-Resource Score": low_avg,
                "Languages Tested": len(all_scores),
            }
        )

    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall Score", ascending=False)

    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )

    df = df[
        [
            "Rank",
            "Model",
            "Overall Score",
            "High-Resource Score",
            "Mid-Resource Score",
            "Low-Resource Score",
            "Languages Tested",
        ]
    ]

    return gr.DataFrame(
        value=df,
        label="Model Leaderboard",
        show_search=False,
        datatype=[
            "number",
            "markdown",
            "number",
            "number",
            "number",
            "number",
            "number",
        ],
    )


def create_model_comparison_plot(metric):
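    """Grouped bar chart of per-model scores for the ten most-spoken languages."""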
    top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]

    title = f"{metric['display_name']} by Model and Language"
    y_label = metric['label']

    scores_flat = []
    for lang in top_languages:
        for score in lang["scores"]:
            value = score[metric['field_name']]
            if value is not None:
                scores_flat.append(
                    {
                        "language": lang["language_name"],
                        "model": score["model"],
                        "value": value,
                    }
                )

    df = pd.DataFrame(scores_flat)
    fig = px.bar(df, x="language", y="value", color="model", barmode="group")
    fig.update_layout(
        title=title,
        xaxis_title=None,
        yaxis_title=y_label,
        barmode="group",
        height=500,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.3,
            xanchor="center",
            x=0.5,
        ),
    )
    return fig


def create_language_stats_df(metric):
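    """One row per language: speakers, aggregate scores, best model, and CommonVoice hours."""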
    flat_data = []

    for lang in results:
        best_model = max(
            lang["scores"] or [{"overall_score": None, "model": None}],
            key=lambda x: x["overall_score"],
        )

        model = best_model["model"]
        model_name = model.split("/")[-1] if model else "N/A"
        model_link = (
            f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
            if model
            else "N/A"
        )
        commonvoice_link = (
            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
            if lang["commonvoice_hours"]
            else "N/A"
        )
        row = {
            "Language": f"**{lang['language_name']}**",
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Overall": round(lang["overall_score"], 3)
            if lang["overall_score"] is not None
            else "N/A",
            "Trans-lation": round(lang["mt_bleu"], 3)
            if lang["mt_bleu"] is not None
            else "N/A",
            "Classi-fication": round(lang["cls_acc"], 3)
            if lang["cls_acc"] is not None
            else "N/A",
            "MLM": round(lang["mlm_chrf"], 3)
            if lang["mlm_chrf"] is not None
            else "N/A",
            "Best Model": model_link,
            "CommonVoice Hours": commonvoice_link,
        }
        flat_data.append(row)

    df = pd.DataFrame(flat_data)
    return gr.DataFrame(
        value=df,
        label="Language Results",
        show_search="search",
        datatype=[
            "markdown",
            "number",
            "number",
            "number",
            "number",
            "number",
            "markdown",
            "markdown",
        ],
    )


def create_scatter_plot(metric):
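    """Scatter plot of speaker population (log scale) vs. average model score per language."""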
    filtered_results = [lang for lang in results if lang["speakers"] >= 10_000]

    scatter_data = []
    for lang in filtered_results:
        scores = [
            score[metric['field_name']]
            for score in lang["scores"]
            if score[metric['field_name']] is not None
        ]
        if scores:
            avg_score = sum(scores) / len(scores)
            scatter_data.append(
                {
                    "language": lang["language_name"],
                    "speakers": lang["speakers"],
                    "score": avg_score,
                }
            )

    fig = go.Figure()

    x_vals = [data["speakers"] / 1_000_000 for data in scatter_data]
    y_vals = [data["score"] for data in scatter_data]
    labels = [data["language"] for data in scatter_data]

    hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate=hover_template,
        )
    )

    fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title=metric['label'],
        height=500,
        showlegend=False,
    )

    fig.update_xaxes(type="log")

    return fig


def format_number(n):
    """Format number with K/M suffix"""
    if n >= 1_000_000:
        return f"{n/1_000_000:.1f}M"
    elif n >= 1_000:
        return f"{n/1_000:.0f}K"
    return str(n)


def get_population_data():
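    """Read per-territory population figures from CLDR's supplementalData.xml via language_data."""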
    import xml.etree.ElementTree as ET

    from language_data.util import data_filename

    filename = data_filename("supplementalData.xml")
    root = ET.fromstring(open(filename).read())
    territories = root.findall("./territoryInfo/territory")

    data = {}
    for territory in territories:
        t_code = territory.attrib["type"]
        t_population = float(territory.attrib["population"])
        data[t_code] = t_population
    return data


def make_black_bar(value, max_width=10):
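    """Render a 0-1 value as a fixed-width bar of filled/empty squares."""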
    filled = int(value * max_width)
    return "⬛️" * filled + "⬜️" * (max_width - filled)


def make_colored_bar(score, max_width=10):
    """Create a colored bar using Unicode blocks based on the score:
    🟦 for high values (>0.35)
    🟨 for medium values (0.25-0.35)
    🟥 for low values (<0.25)
    ⬜ for empty space

    The raw score is scaled to max_width and clamped; the thresholds above pick the color.
    """
    filled = int(score * max_width)
    filled = max(0, min(filled, max_width))
    empty = max_width - filled

    if score > 0.35:
        return "🟦" * filled + "⬜" * empty
    elif score > 0.25:
        return "🟨" * filled + "⬜" * empty
    else:
        return "🟥" * filled + "⬜" * empty


def create_world_map(metric):
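    """Choropleth of the speaker-weighted average score per country.

    Each language's score is weighted by its speakers in every country; the hover
    text lists the largest languages with speaker-share and score bars.
    """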
    population_data = get_population_data()
    country_data = {}
    for lang in results:
        if "population" not in lang or lang[metric['field_name']] is None:
            continue

        for country_code, speakers in lang["population"].items():
            try:
                country = pycountry.countries.get(alpha_2=country_code)
                if country is None:
                    continue

                iso3_code = country.alpha_3
                if iso3_code not in country_data:
                    country_data[iso3_code] = {
                        "total_speakers": 0,
                        "population": population_data.get(country_code, 0),
                        "weighted_score_sum": 0,
                        "languages": [],
                    }

                country_data[iso3_code]["total_speakers"] += speakers
                country_data[iso3_code]["weighted_score_sum"] += (
                    speakers * lang[metric['field_name']]
                )
                country_data[iso3_code]["languages"].append(
                    {
                        "name": lang["language_name"],
                        "speakers": speakers,
                        "score": lang[metric['field_name']],
                    }
                )
            except (KeyError, AttributeError):
                continue

    countries = []
    scores = []
    hover_texts = []

    for country_code, data in country_data.items():
        weighted_avg = data["weighted_score_sum"] / data["total_speakers"]

        try:
            country_name = pycountry.countries.get(alpha_3=country_code).name
        except AttributeError:
            country_name = country_code

        langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)

        main_langs = langs[:5]
        other_langs = langs[5:]

        # CLDR may lack population data for some territories (stored as 0 above);
        # fall back to 1 to avoid dividing by zero in the percentage bars.
        population = data["population"] or 1

        lang_rows = []
        for lang in main_langs:
            percentage = (lang["speakers"] / population) * 100
            speaker_bar = make_black_bar(percentage / 100)
            score_bar = make_colored_bar(lang["score"])

            lang_rows.append(
                f"<b>{lang['name']}</b><br>"
                f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
                f"{score_bar} {lang['score']:.3f} {metric['label']}<br>"
            )

        if other_langs:
            other_speakers = sum(lang["speakers"] for lang in other_langs)
            other_percentage = (other_speakers / population) * 100
            other_avg_score = sum(lang["score"] for lang in other_langs) / len(
                other_langs
            )

            speaker_bar = make_black_bar(other_percentage / 100)
            score_bar = make_colored_bar(other_avg_score)

            lang_rows.append(
                f"<b>+{len(other_langs)} other languages</b><br>"
                f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
                f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>"
            )

        hover_text = f"<b>{country_name}</b><br><br>" + "<br>".join(lang_rows)

        countries.append(country_code)
        scores.append(weighted_avg)
        hover_texts.append(hover_text)

    fig = go.Figure(
        data=go.Choropleth(
            locations=countries,
            locationmode="ISO-3",
            z=scores,
            text=hover_texts,
            hoverinfo="text",
            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
            colorbar=dict(
                title=metric['label'],
                orientation="h",
                y=-0.2,
                yanchor="bottom",
                len=0.5,
                x=0.5,
                xanchor="center",
                thickness=20,
            ),
        )
    )

    fig.update_layout(
        title=dict(text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"),
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection_type="equal earth",
            showland=True,
            landcolor="#f8f9fa",
            coastlinecolor="#e0e0e0",
            countrycolor="#e0e0e0",
        ),
        height=600,
        margin=dict(l=0, r=0, t=30, b=0),
        paper_bgcolor="white",
        hoverlabel=dict(
            bgcolor="beige",
            font_size=12,
        ),
    )

    return fig


def create_metric_explanation(metric):
    return gr.Markdown(metric['explanation'])


with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
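    # The metric dropdown re-renders the explanation, model comparison plot, scatter
    # plot, and world map via the change handlers registered at the end of this block.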
    gr.Markdown("# AI Language Proficiency Benchmark")
    gr.Markdown(
        "Comparing language proficiency across different models and languages."
    )
    start_metric = METRICS["overall_performance"]

    metric = gr.Dropdown(
        choices=[
            metric_info["display_name"]
            for metric_info in METRICS.values()
        ],
        value=start_metric["display_name"],
        label="Select Metric",
        interactive=True,
    )
    metric_explanation = create_metric_explanation(start_metric)

    gr.Markdown("## Model Comparison")
    create_leaderboard_df(start_metric)
    model_comparison_plot = gr.Plot(
        value=create_model_comparison_plot(start_metric),
        label="Model Comparison",
    )

    gr.Markdown("## Language Stats")
    create_language_stats_df(start_metric)
    scatter_plot = gr.Plot(
        value=create_scatter_plot(start_metric),
        label="Speaker Population vs. Metric",
    )
    world_map = gr.Plot(
        value=create_world_map(start_metric),
        label="World Map",
        container=False,
        elem_classes="fullwidth-plot",
    )

    gr.Markdown(
        """
## Methodology
### Dataset
- Uses the [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality, human-translated benchmark comprising 200 languages
- Each language is tested with the same 100 sentences
- All translations are from the evaluated language into a fixed set of representative languages, sampled by number of speakers
- Language statistics are sourced from Ethnologue and Wikidata

### Models & Evaluation
- Models are accessed through [OpenRouter](https://openrouter.ai/), including fast models from all major labs, both open and closed
- **BLEU Score**: Translations are evaluated using the BLEU metric, which measures how similar the AI's translation is to a human reference translation -- higher is better

### Language Categories
Languages are divided into three tiers based on translation difficulty:
- High-Resource: top 25% of languages by BLEU score (easiest to translate)
- Mid-Resource: middle 50% of languages
- Low-Resource: bottom 25% of languages (hardest to translate)
""",
        container=True,
    )

    def update_component(fn, metric_choice):
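        # Look up the METRICS entry whose display_name matches the dropdown choice,
        # then rebuild the given component for that metric.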
        metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
        return fn(metric)

    from functools import partial

    metric.change(
        fn=partial(update_component, create_metric_explanation),
        inputs=metric,
        outputs=metric_explanation,
    )
    metric.change(
        fn=partial(update_component, create_model_comparison_plot),
        inputs=metric,
        outputs=model_comparison_plot,
    )
    metric.change(
        fn=partial(update_component, create_scatter_plot),
        inputs=metric,
        outputs=scatter_plot,
    )
    metric.change(
        fn=partial(update_component, create_world_map),
        inputs=metric,
        outputs=world_map,
    )

demo.launch()