import json

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import pycountry

with open("results.json") as f:
    results = json.load(f)
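
# Rough shape of each entry in results.json, inferred from the fields this app
# reads below (an assumption, not a guaranteed schema; values are illustrative):
#
#     {
#         "language_name": "...",
#         "speakers": 123456789,
#         "population": {"IN": 400000000, ...},   # speakers per country (ISO alpha-2)
#         "bleu": 0.31, "accuracy": 0.74, "mlm": 0.55, "overall_score": 0.53,
#         "commonvoice_hours": 120.5, "commonvoice_locale": "hi",
#         "scores": [{"model": "org/model-name", "bleu": 0.33, "overall_score": 0.52}, ...],
#     }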


def mean(lst):
    return sum(lst) / len(lst)


def create_leaderboard_df(results):
    # Rank languages by BLEU and split them into resource tiers: the top quarter
    # counts as high-resource, the bottom quarter as low-resource, the rest as mid.
    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
    n_langs = len(sorted_langs)
    high_cutoff = n_langs // 4
    low_cutoff = n_langs - n_langs // 4

    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}

    # Collect each model's BLEU scores, bucketed by resource tier.
    model_scores = {}
    for lang in results:
        category = (
            "High-Resource"
            if lang["language_name"] in high_resource
            else "Low-Resource"
            if lang["language_name"] in low_resource
            else "Mid-Resource"
        )

        for score in lang["scores"]:
            model = score["model"]
            if model not in model_scores:
                model_scores[model] = {
                    "High-Resource": [],
                    "Mid-Resource": [],
                    "Low-Resource": [],
                }
            model_scores[model][category].append(score["bleu"])

    leaderboard_data = []
    for model, categories in model_scores.items():
        high_avg = (
            round(mean(categories["High-Resource"]), 3)
            if categories["High-Resource"]
            else 0
        )
        mid_avg = (
            round(mean(categories["Mid-Resource"]), 3)
            if categories["Mid-Resource"]
            else 0
        )
        low_avg = (
            round(mean(categories["Low-Resource"]), 3)
            if categories["Low-Resource"]
            else 0
        )

        all_scores = (
            categories["High-Resource"]
            + categories["Mid-Resource"]
            + categories["Low-Resource"]
        )
        overall_avg = round(mean(all_scores), 3)

        model_name = model.split("/")[-1]
        leaderboard_data.append(
            {
                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                "Overall BLEU": overall_avg,
                "High-Resource BLEU": high_avg,
                "Mid-Resource BLEU": mid_avg,
                "Low-Resource BLEU": low_avg,
                "Languages Tested": len(all_scores),
            }
        )

    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall BLEU", ascending=False)

    # Medal emoji for the top three models, plain numbers below that.
    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )

    df = df[
        [
            "Rank",
            "Model",
            "Overall BLEU",
            "High-Resource BLEU",
            "Mid-Resource BLEU",
            "Low-Resource BLEU",
            "Languages Tested",
        ]
    ]

    return gr.DataFrame(
        value=df,
        label="Model Leaderboard",
        show_search="none",
        datatype=[
            "number",
            "markdown",
            "number",
            "number",
            "number",
            "number",
            "number",
        ],
    )


def create_model_comparison_plot(results):
    # Grouped bar chart: per-model BLEU for the ten most widely spoken languages.
    top_languages = sorted(results, key=lambda x: x["speakers"], reverse=True)[:10]
    scores_flat = [
        {
            "language": lang["language_name"],
            "model": score["model"],
            "bleu": score["bleu"],
        }
        for lang in top_languages
        for score in lang["scores"]
    ]
    df = pd.DataFrame(scores_flat)
    fig = px.bar(df, x="language", y="bleu", color="model", barmode="group")
    fig.update_layout(
        title="BLEU Scores by Model and Language",
        xaxis_title=None,
        yaxis_title="BLEU Score",
        height=500,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.3,
            xanchor="center",
            x=0.5,
        ),
    )
    return fig


def create_language_stats_df(results):
    # One row per language: headline scores plus the best-performing model.
    flat_data = []

    for lang in results:
        # Pick the model with the highest overall score for this language
        # (fall back to a placeholder when no scores are available).
        best_score = max(
            lang["scores"] or [{"overall_score": None, "model": None}],
            key=lambda x: x["overall_score"],
        )

        model = best_score["model"]
        model_name = model.split("/")[-1] if model else "N/A"
        model_link = (
            f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
            if model
            else "N/A"
        )
        commonvoice_link = (
            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
            if lang["commonvoice_hours"]
            else "N/A"
        )
        row = {
            "Language": f"**{lang['language_name']}**",
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Overall": round(lang["overall_score"], 3)
            if lang["overall_score"] is not None
            else "N/A",
            "Trans-lation": round(lang["bleu"], 3)
            if lang["bleu"] is not None
            else "N/A",
            "Classi-fication": round(lang["accuracy"], 3)
            if lang["accuracy"] is not None
            else "N/A",
            "MLM": round(lang["mlm"], 3)
            if lang["mlm"] is not None
            else "N/A",
            "Best Model": model_link,
            "CommonVoice Hours": commonvoice_link,
        }
        flat_data.append(row)

    df = pd.DataFrame(flat_data)
    return gr.DataFrame(
        value=df,
        label="Language Results",
        show_search="search",
        datatype=[
            "markdown",
            "number",
            "number",
            "number",
            "number",
            "number",
            "markdown",
            "markdown",
        ],
    )


def create_scatter_plot(results):
    fig = go.Figure()

    # Apply the same filter to x, y, and labels so the three lists stay aligned.
    langs = [lang for lang in results if lang["speakers"] >= 10_000]
    x_vals = [lang["speakers"] / 1_000_000 for lang in langs]
    y_vals = [lang["bleu"] for lang in langs]
    labels = [lang["language_name"] for lang in langs]

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate="<b>%{text}</b><br>"
            + "Speakers: %{x:.1f}M<br>"
            + "BLEU Score: %{y:.3f}<extra></extra>",
        )
    )

    fig.update_layout(
        title=None,
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title="Average BLEU Score",
        height=500,
        showlegend=False,
    )

    # Speaker counts span several orders of magnitude, so use a log axis.
    fig.update_xaxes(type="log")

    return gr.Plot(value=fig, label="Speaker population vs BLEU")


def format_number(n):
    """Format number with K/M suffix"""
    if n >= 1_000_000:
        return f"{n/1_000_000:.1f}M"
    elif n >= 1_000:
        return f"{n/1_000:.0f}K"
    return str(n)
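
# For example: format_number(1_500_000) -> "1.5M", format_number(12_000) -> "12K",
# and format_number(500) -> "500".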


def get_population_data():
    # Per-territory population figures from the CLDR supplemental data shipped
    # with the `language_data` package, keyed by territory code.
    import xml.etree.ElementTree as ET
    from language_data.util import data_filename

    filename = data_filename("supplementalData.xml")
    root = ET.parse(filename).getroot()
    territories = root.findall("./territoryInfo/territory")

    data = {}
    for territory in territories:
        t_code = territory.attrib["type"]
        t_population = float(territory.attrib["population"])
        data[t_code] = t_population
    return data


def create_world_map(results):
    # Aggregate per-language BLEU into a per-country score, weighting each
    # language by its number of speakers in that country.
    population_data = get_population_data()
    country_data = {}
    for lang in results:
        if "population" not in lang or lang["bleu"] is None:
            continue

        for country_code, speakers in lang["population"].items():
            try:
                country = pycountry.countries.get(alpha_2=country_code)
                if country is None:
                    continue

                iso3_code = country.alpha_3
                if iso3_code not in country_data:
                    country_data[iso3_code] = {
                        "total_speakers": 0,
                        "population": population_data.get(country_code, 0),
                        "weighted_bleu_sum": 0,
                        "languages": [],
                    }

                country_data[iso3_code]["total_speakers"] += speakers
                country_data[iso3_code]["weighted_bleu_sum"] += speakers * lang["bleu"]
                country_data[iso3_code]["languages"].append(
                    {
                        "name": lang["language_name"],
                        "speakers": speakers,
                        "bleu": lang["bleu"],
                    }
                )
            except (KeyError, AttributeError):
                continue

    countries = []
    bleu_scores = []
    hover_texts = []

    def make_black_bar(value, max_width=10):
        # Plain progress bar for a 0-1 value (used for speaker share of population).
        filled = int(value * max_width)
        filled = max(0, min(filled, max_width))
        return "⬛️" * filled + "⬜️" * (max_width - filled)

    def make_colored_bar(value, max_width=10):
        """Create a colored bar using Unicode blocks

        🟦 for high values (>0.35)
        🟨 for medium values (0.25-0.35)
        🟥 for low values (<0.25)
        ⬜ for empty space
        """
        filled = int(value * max_width)
        filled = max(0, min(filled, max_width))
        empty = max_width - filled

        if value > 0.35:
            return "🟦" * filled + "⬜" * empty
        elif value > 0.25:
            return "🟨" * filled + "⬜" * empty
        else:
            return "🟥" * filled + "⬜" * empty

    for country_code, data in country_data.items():
        # Speaker-weighted average BLEU across the languages spoken in this country.
        weighted_avg = data["weighted_bleu_sum"] / data["total_speakers"]

        try:
            country_name = pycountry.countries.get(alpha_3=country_code).name
        except AttributeError:
            country_name = country_code

        langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)

        # Show the five largest languages individually and fold the rest into one row.
        main_langs = langs[:5]
        other_langs = langs[5:]

        lang_rows = []
        for lang in main_langs:
            percentage = (
                (lang["speakers"] / data["population"]) * 100
                if data["population"]
                else 0
            )
            speaker_bar = make_black_bar(percentage / 100)
            bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)

            lang_rows.append(
                f"<b>{lang['name']}</b><br>"
                f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>"
                f"{bleu_bar} {lang['bleu']:.3f} BLEU<br>"
            )

        if other_langs:
            other_speakers = sum(lang["speakers"] for lang in other_langs)
            other_percentage = (
                (other_speakers / data["population"]) * 100
                if data["population"]
                else 0
            )
            other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(other_langs)

            speaker_bar = make_black_bar(other_percentage / 100)
            bleu_bar = make_colored_bar((other_avg_bleu - 0.2) / 0.2)

            lang_rows.append(
                f"<b>+{len(other_langs)} other languages</b><br>"
                f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
                f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
            )

        hover_text = (
            f"<b>{country_name}</b><br><br>"
            f"{'<br>'.join(lang_rows)}"
        )

        countries.append(country_code)
        bleu_scores.append(weighted_avg)
        hover_texts.append(hover_text)

    # Choropleth shaded by the speaker-weighted BLEU, with per-language hover details.
    fig = go.Figure(
        data=go.Choropleth(
            locations=countries,
            locationmode="ISO-3",
            z=bleu_scores,
            text=hover_texts,
            hoverinfo="text",
            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
            colorbar=dict(
                title="BLEU Score",
                orientation="h",
                y=-0.2,
                yanchor="bottom",
                len=0.5,
                x=0.5,
                xanchor="center",
                thickness=20,
            ),
            zmin=0.1,
            zmax=0.5,
        )
    )

    fig.update_layout(
        title=dict(text="BLEU Score by Country", x=0.5, xanchor="center"),
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection_type="equal earth",
            showland=True,
            landcolor="#f8f9fa",
            coastlinecolor="#e0e0e0",
            countrycolor="#e0e0e0",
        ),
        height=600,
        margin=dict(l=0, r=0, t=30, b=0),
        paper_bgcolor="white",
        hoverlabel=dict(
            bgcolor="beige",
            font_size=12,
        ),
    )

    return fig
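

# Illustrative only: a minimal sketch of the sentence-level BLEU computation the
# methodology text below describes. It assumes the `sacrebleu` package (imported
# lazily, so the app still runs without it) and is not necessarily how this
# benchmark's own evaluation pipeline computes its scores.
def example_sentence_bleu(hypothesis, reference):
    import sacrebleu

    # sacrebleu reports BLEU on a 0-100 scale; rescale to 0-1 to match the tables above.
    return sacrebleu.sentence_bleu(hypothesis, [reference]).score / 100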


with gr.Blocks(title="AI Language Translation Benchmark") as demo:
    gr.Markdown("# AI Language Translation Benchmark")
    gr.Markdown(
        "Comparing translation performance across different AI models and languages"
    )

    bar_plot = create_model_comparison_plot(results)
    world_map = create_world_map(results)

    # Build the UI: leaderboard, model comparison, per-language table, scatter plot, world map.
    create_leaderboard_df(results)
    gr.Plot(value=bar_plot, label="Model Comparison")
    create_language_stats_df(results)
    create_scatter_plot(results)
    gr.Plot(value=world_map, container=False, elem_classes="fullwidth-plot")

    gr.Markdown(
        """
## Methodology

### Dataset
- Uses the [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality human-translated benchmark covering 200 languages
- Each language is tested with the same 100 sentences
- All translations are from the evaluated language to a fixed set of representative languages sampled by number of speakers
- Language statistics are sourced from Ethnologue and Wikidata

### Models & Evaluation
- Models are accessed through [OpenRouter](https://openrouter.ai/), including fast models from all major labs, both open and closed
- **BLEU Score**: Translations are evaluated with the BLEU metric, which measures how similar the model's translation is to a human reference translation; higher is better

### Language Categories
Languages are divided into three tiers based on translation difficulty:
- High-Resource: Top 25% of languages by BLEU score (easiest to translate)
- Mid-Resource: Middle 50% of languages
- Low-Resource: Bottom 25% of languages (hardest to translate)
""",
        container=True,
    )

demo.launch()