import json
from functools import partial
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pycountry
with open("results.json") as f:
results = json.load(f)
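# Each entry of results.json is expected to look roughly like the sketch below.
# The schema is inferred from the fields accessed in this file; the names and
# values shown are illustrative only, not taken from the actual data.
#
#   {
#       "language_name": "Swahili",
#       "speakers": 71000000,
#       "population": {"TZ": 47000000, "KE": 16000000},  # speakers per country
#       "commonvoice_locale": "sw",
#       "commonvoice_hours": 410,
#       "overall_score": 0.55,
#       "mt_bleu": 0.32,
#       "mt_chrf": 0.51,
#       "cls_acc": 0.62,
#       "mlm_chrf": 0.70,
#       "scores": [  # one entry per evaluated model
#           {"model": "openai/gpt-4o-mini", "overall_score": 0.58, "mt_bleu": 0.34,
#            "mt_chrf": 0.53, "cls_acc": 0.65, "mlm_chrf": 0.72},
#       ],
#   }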
# Global constants for metric mappings
METRICS = {
"overall_performance": {
"display_name": "Overall Performance",
"field_name": "overall_score",
"label": "Overall Performance Score",
"explanation": """
**Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
Higher scores indicate better overall language capabilities.
""",
},
"translation_bleu": {
"display_name": "Translation (BLEU)",
"field_name": "mt_bleu",
"label": "BLEU Score",
"explanation": """
**Translation BLEU**: Bilingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
""",
},
"translation_chrf": {
"display_name": "Translation (ChrF)",
"field_name": "mt_chrf",
"label": "ChrF Score",
"explanation": """
**Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
Higher scores (0-1) indicate better translations.
""",
},
"classification_accuracy": {
"display_name": "Classification (Accuracy)",
"field_name": "cls_acc",
"label": "Classification Accuracy",
"explanation": """
**Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
This evaluates a model's understanding of content and context across different languages.
Scores range from 0 to 1, with higher values indicating better classification performance.
""",
},
"mlm_chrf": {
"display_name": "Masked Language Modeling (ChrF)",
"field_name": "mlm_chrf",
"label": "MLM ChrF Score",
"explanation": """
**Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
This tests a model's understanding of language structure and semantics by measuring the character-level similarity
between predicted and actual text. Higher scores indicate better language understanding.
""",
},
}
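# The BLEU and ChrF explanations above refer to standard machine-translation
# metrics. As an illustration only (assuming the sacrebleu package, which this
# app does not itself import), sentence-level scores on a 0-1 scale could be
# computed like this:
#
#   import sacrebleu
#
#   hypothesis = "The cat sat on the mat."
#   references = ["A cat was sitting on the mat."]
#   bleu = sacrebleu.sentence_bleu(hypothesis, references).score / 100
#   chrf = sacrebleu.sentence_chrf(hypothesis, references).score / 100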
def mean(lst):
return sum(lst) / len(lst)
def create_leaderboard_df(metric):
    # Sort languages by the selected metric to determine resource categories
langs_with_score = [
lang for lang in results if lang[metric["field_name"]] is not None
]
sorted_langs = sorted(
langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
)
n_langs = len(sorted_langs)
high_cutoff = n_langs // 4 # top 25%
low_cutoff = n_langs - n_langs // 4 # bottom 25%
# Create sets of languages for each category
high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
# Get all model scores with categorization
model_scores = {}
for lang in results:
category = (
"High-Resource"
if lang["language_name"] in high_resource
else "Low-Resource"
if lang["language_name"] in low_resource
else "Mid-Resource"
)
for score in lang["scores"]:
model = score["model"]
if model not in model_scores:
model_scores[model] = {
"High-Resource": [],
"Mid-Resource": [],
"Low-Resource": [],
}
            value = score[metric["field_name"]]
            if value is not None:
                model_scores[model][category].append(value)
# Calculate average scores and create DataFrame
leaderboard_data = []
for model, categories in model_scores.items():
# Calculate averages for each category
high_avg = (
round(mean(categories["High-Resource"]), 3)
if categories["High-Resource"]
else 0
)
mid_avg = (
round(mean(categories["Mid-Resource"]), 3)
if categories["Mid-Resource"]
else 0
)
low_avg = (
round(mean(categories["Low-Resource"]), 3)
if categories["Low-Resource"]
else 0
)
        # Calculate the overall average across all categories
        all_scores = (
            categories["High-Resource"]
            + categories["Mid-Resource"]
            + categories["Low-Resource"]
        )
        overall_avg = round(mean(all_scores), 3) if all_scores else 0
model_name = model.split("/")[-1]
leaderboard_data.append(
{
"Model": f"[{model_name}](https://openrouter.ai/{model})",
"Overall Score": overall_avg,
"High-Resource Score": high_avg,
"Mid-Resource Score": mid_avg,
"Low-Resource Score": low_avg,
"Languages Tested": len(all_scores),
}
)
    # Sort by overall score
df = pd.DataFrame(leaderboard_data)
df = df.sort_values("Overall Score", ascending=False)
# Add rank and medals
df["Rank"] = range(1, len(df) + 1)
df["Rank"] = df["Rank"].apply(
lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
)
# Reorder columns
df = df[
[
"Rank",
"Model",
"Overall Score",
"High-Resource Score",
"Mid-Resource Score",
"Low-Resource Score",
"Languages Tested",
]
]
return gr.DataFrame(
value=df,
label="Model Leaderboard",
        show_search="none",
datatype=[
"number",
"markdown",
"number",
"number",
"number",
"number",
"number",
],
)
def create_model_comparison_plot(metric):
top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]
# Create appropriate title and y-axis label based on metric
title = f"{metric['display_name']} by Model and Language"
y_label = metric["label"]
# Flatten the data for the selected metric
scores_flat = []
for lang in top_languages:
for score in lang["scores"]:
# Get the value directly using the field name
value = score[metric["field_name"]]
if value is not None:
scores_flat.append(
{
"language": lang["language_name"],
"model": score["model"],
"value": value,
}
)
df = pd.DataFrame(scores_flat)
fig = px.bar(df, x="language", y="value", color="model", barmode="group")
fig.update_layout(
title=title,
xaxis_title=None,
yaxis_title=y_label,
barmode="group",
height=500,
legend=dict(
orientation="h", # horizontal orientation
yanchor="bottom",
y=-0.3, # position below plot
xanchor="center",
x=0.5, # center horizontally
),
)
return fig
def create_language_stats_df(metric):
# Create a list to store flattened data
flat_data = []
for lang in results:
        # Find the best model by overall score (missing scores are treated as 0)
        best_model = max(
            lang["scores"] or [{"overall_score": None, "model": None}],
            key=lambda x: x["overall_score"] or 0,
        )
model = best_model["model"]
model_name = model.split("/")[-1] if model else "N/A"
model_link = (
f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
if model
else "N/A"
)
commonvoice_link = (
f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
if lang["commonvoice_hours"]
else "N/A"
)
row = {
"Language": f"**{lang['language_name']}**",
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
# "Models Tested": len(lang["scores"]),
"Overall": round(lang["overall_score"], 3)
if lang["overall_score"] is not None
else "N/A",
"Trans-lation": round(lang["mt_bleu"], 3)
if lang["mt_bleu"] is not None
else "N/A",
"Classi-fication": round(lang["cls_acc"], 3)
if lang["cls_acc"] is not None
else "N/A",
"MLM": round(lang["mlm_chrf"], 3)
if lang["mlm_chrf"] is not None
else "N/A",
"Best Model": model_link,
"CommonVoice Hours": commonvoice_link,
}
flat_data.append(row)
df = pd.DataFrame(flat_data)
return gr.DataFrame(
value=df,
label="Language Results",
show_search="search",
datatype=[
"markdown", # Language
"number", # Speakers
# "number", # Models Tested
"number", # Overall
"number", # Translation
"number", # Classification
"number", # MLM
"markdown", # Best Model
"markdown", # CommonVoice Hours
],
)
def create_scatter_plot(metric):
# Filter results to include only languages with sufficient speakers
filtered_results = [lang for lang in results if lang["speakers"] >= 10_000]
# Create a list to store data for the scatter plot
scatter_data = []
for lang in filtered_results:
# Calculate average score for this metric across all models
scores = [
score[metric["field_name"]]
for score in lang["scores"]
if score[metric["field_name"]] is not None
]
if scores: # Only include if we have valid scores
avg_score = sum(scores) / len(scores)
scatter_data.append(
{
"language": lang["language_name"],
"speakers": lang["speakers"],
"score": avg_score,
}
)
fig = go.Figure()
# Convert speakers to millions for display
x_vals = [
data["speakers"] / 1_000_000 for data in scatter_data
] # Convert to millions
y_vals = [data["score"] for data in scatter_data]
labels = [data["language"] for data in scatter_data]
# Create hover template
hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
fig.add_trace(
go.Scatter(
x=x_vals,
y=y_vals,
mode="markers+text",
text=labels,
textposition="top center",
hovertemplate=hover_template,
)
)
fig.update_layout(
title=None,
xaxis_title="Number of Speakers (Millions)",
yaxis_title=metric["label"],
height=500,
showlegend=False,
)
# Use log scale for x-axis since speaker numbers vary widely
fig.update_xaxes(type="log")
return fig
def format_number(n):
"""Format number with K/M suffix"""
if n >= 1_000_000:
return f"{n/1_000_000:.1f}M"
elif n >= 1_000:
return f"{n/1_000:.0f}K"
return str(n)
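# Illustrative sanity checks for format_number, documenting the expected output:
assert format_number(950) == "950"
assert format_number(250_000) == "250K"
assert format_number(3_200_000) == "3.2M"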
def get_population_data():
import xml.etree.ElementTree as ET
from language_data.util import data_filename
filename = data_filename("supplementalData.xml")
    # supplementalData.xml lists per-territory data, including a "population"
    # attribute on each <territory type="XX" ...> element.
    root = ET.parse(filename).getroot()
territories = root.findall("./territoryInfo/territory")
data = {}
for territory in territories:
t_code = territory.attrib["type"]
t_population = float(territory.attrib["population"])
data[t_code] = t_population
return data
# Helper functions for visualization
def make_black_bar(value, max_width=10):
filled = int(value * max_width)
return "⬛️" * filled + "⬜️" * (max_width - filled)
def make_colored_bar(score, max_width=10):
"""Create a colored bar using Unicode blocks based on normalized score
🟦 for high values (>0.35)
🟨 for medium values (0.25-0.35)
🟥 for low values (<0.25)
⬜ for empty space
This function handles both normalization and bar creation.
"""
# Create the bar based on normalized value
filled = int(score * max_width)
filled = max(0, min(filled, max_width))
empty = max_width - filled
if score > 0.35:
return "🟦" * filled + "⬜" * empty
elif score > 0.25:
return "🟨" * filled + "⬜" * empty
else:
return "🟥" * filled + "⬜" * empty
def create_world_map(metric):
# Collect all country data
population_data = get_population_data()
country_data = {}
for lang in results:
# Skip languages without the required data
if "population" not in lang or lang[metric["field_name"]] is None:
continue
for country_code, speakers in lang["population"].items():
try:
# Convert alpha_2 (2-letter) to alpha_3 (3-letter) code
country = pycountry.countries.get(alpha_2=country_code)
if country is None:
continue
iso3_code = country.alpha_3
if iso3_code not in country_data:
country_data[iso3_code] = {
"total_speakers": 0,
"population": population_data.get(country_code, 0),
"weighted_score_sum": 0,
"languages": [],
}
country_data[iso3_code]["total_speakers"] += speakers
country_data[iso3_code]["weighted_score_sum"] += (
speakers * lang[metric["field_name"]]
)
country_data[iso3_code]["languages"].append(
{
"name": lang["language_name"],
"speakers": speakers,
"score": lang[metric["field_name"]],
}
)
except (KeyError, AttributeError):
# Skip invalid or unrecognized country codes
continue
# Calculate final weighted averages and prepare hover text
countries = []
scores = []
hover_texts = []
for country_code, data in country_data.items():
weighted_avg = data["weighted_score_sum"] / data["total_speakers"]
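        # Example: 30M speakers of a language scoring 0.8 plus 10M speakers of a
        # language scoring 0.4 give (30 * 0.8 + 10 * 0.4) / 40 = 0.7 for the country.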
try:
country_name = pycountry.countries.get(alpha_3=country_code).name
except AttributeError:
country_name = country_code
# Sort languages by number of speakers
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
# Take top 5 languages and summarize the rest
main_langs = langs[:5]
other_langs = langs[5:]
# Create language rows with bars
lang_rows = []
for lang in main_langs:
            percentage = (
                lang["speakers"] / data["population"] * 100
                if data["population"]
                else 0
            )
speaker_bar = make_black_bar(percentage / 100)
# Use the integrated make_colored_bar function directly
score_bar = make_colored_bar(lang["score"])
lang_rows.append(
f"<b>{lang['name']}</b><br>"
f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
f"{score_bar} {lang['score']:.3f} {metric['label']}<br>"
)
# Add summary for other languages if any
if other_langs:
other_speakers = sum(lang["speakers"] for lang in other_langs)
            other_percentage = (
                other_speakers / data["population"] * 100 if data["population"] else 0
            )
other_avg_score = sum(lang["score"] for lang in other_langs) / len(
other_langs
)
speaker_bar = make_black_bar(other_percentage / 100)
# Use the integrated make_colored_bar function directly
score_bar = make_colored_bar(other_avg_score)
lang_rows.append(
f"<b>+{len(other_langs)} other languages</b><br>"
f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>"
)
hover_text = f"<b>{country_name}</b><br><br>" f"{'<br>'.join(lang_rows)}"
countries.append(country_code)
scores.append(weighted_avg)
hover_texts.append(hover_text)
# Create the choropleth map
fig = go.Figure(
data=go.Choropleth(
locations=countries,
locationmode="ISO-3",
z=scores,
text=hover_texts,
hoverinfo="text",
colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
colorbar=dict(
title=metric["label"],
orientation="h", # horizontal orientation
y=-0.2, # position below map
yanchor="bottom",
len=0.5, # length of colorbar
x=0.5, # center horizontally
xanchor="center",
thickness=20, # make it a bit thicker when horizontal
),
)
)
fig.update_layout(
title=dict(
text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"
),
geo=dict(
showframe=True,
showcoastlines=True,
projection_type="equal earth",
showland=True,
landcolor="#f8f9fa",
coastlinecolor="#e0e0e0",
countrycolor="#e0e0e0",
),
height=600,
margin=dict(l=0, r=0, t=30, b=0),
paper_bgcolor="white",
hoverlabel=dict(
bgcolor="beige",
font_size=12,
),
)
return fig
def create_metric_explanation(metric):
return gr.Markdown(metric["explanation"])
# Create the visualization components
with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
gr.Markdown("# AI Language Proficiency Benchmark")
gr.Markdown("Comparing language proficiency across different models and languages.")
start_metric = METRICS["overall_performance"]
metric = gr.Dropdown(
choices=[metric_info["display_name"] for metric_info in METRICS.values()],
value=start_metric["display_name"],
label="Select Metric",
interactive=True,
)
metric_explanation = create_metric_explanation(start_metric)
gr.Markdown("## Model Comparison")
create_leaderboard_df(start_metric)
model_comparison_plot = gr.Plot(
value=create_model_comparison_plot(start_metric),
label="Model Comparison",
)
gr.Markdown("## Language Stats")
create_language_stats_df(start_metric)
scatter_plot = gr.Plot(
value=create_scatter_plot(start_metric),
label="Speaker Population vs. Metric",
)
world_map = gr.Plot(
value=create_world_map(start_metric),
label="World Map",
container=False,
elem_classes="fullwidth-plot",
)
gr.Markdown(
"""
## Methodology
### Benchmark Data
We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
Population and speaker data, as well as language code resolution, come from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
### AI Models
We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
### Evaluation Tasks
Our benchmark includes three core tasks to assess different aspects of language understanding:
1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
- [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
- [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
- Group sentences by URL into paragraphs with the same topic
- Use the 5 most common topics, encoded as numbers rather than English labels
- Provide 5 examples of each topic as few-shot examples
- Test the model's ability to classify new text
- Report accuracy as the primary metric
3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
- Mask approximately 5% of each sentence at a random position
- Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
- Evaluate predictions using ChrF score against the original text
The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
""",
container=True,
)
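    # The masking step described in the methodology above could look roughly like
    # the sketch below. This is a hypothetical helper shown for illustration; the
    # actual evaluation code is not part of this app file.
    #
    #   import random
    #
    #   def mask_sentence(sentence: str, fraction: float = 0.05) -> str:
    #       """Replace a contiguous span of ~5% of a non-empty sentence with "<mask>"."""
    #       span = max(1, int(len(sentence) * fraction))
    #       start = random.randint(0, len(sentence) - span)
    #       return sentence[:start] + "<mask>" + sentence[start + span:]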
def update_component(fn, metric_choice):
metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
return fn(metric)
metric.change(
fn=partial(update_component, create_metric_explanation),
inputs=metric,
outputs=metric_explanation,
)
metric.change(
fn=partial(update_component, create_model_comparison_plot),
inputs=metric,
outputs=model_comparison_plot,
)
metric.change(
fn=partial(update_component, create_scatter_plot),
inputs=metric,
outputs=scatter_plot,
)
metric.change(
fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
)
demo.launch()