import json

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
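
# Assumed shape of results.json (inferred from how `results` is used below): a list of
# per-language entries such as
#   {"language_name": "...", "speakers": 123456, "bleu": 0.42, "commonvoice_hours": 10,
#    "scores": [{"model": "org/model-id", "bleu": 0.42}, ...]}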
with open("results.json") as f:
    results = json.load(f)


def mean(lst):
    return sum(lst) / len(lst)


def create_leaderboard_df(results):
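    """Build the model leaderboard, aggregating BLEU scores by resource tier.

    Tiers follow the Methodology section below: the top quartile of languages by
    average BLEU is High-Resource, the bottom quartile Low-Resource, and the rest
    Mid-Resource.
    """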
    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
    n_langs = len(sorted_langs)
    high_cutoff = n_langs // 4
    low_cutoff = n_langs - n_langs // 4

    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
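
    # Collect each model's per-language BLEU scores, bucketed by resource tier.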
    model_scores = {}
    for lang in results:
        category = (
            "High-Resource"
            if lang["language_name"] in high_resource
            else "Low-Resource"
            if lang["language_name"] in low_resource
            else "Mid-Resource"
        )

        for score in lang["scores"]:
            model = score["model"]
            if model not in model_scores:
                model_scores[model] = {
                    "High-Resource": [],
                    "Mid-Resource": [],
                    "Low-Resource": [],
                }
            model_scores[model][category].append(score["bleu"])
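
    # Build one leaderboard row per model: per-tier averages plus an overall average.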
    leaderboard_data = []
    for model, categories in model_scores.items():
        high_avg = (
            round(mean(categories["High-Resource"]), 3)
            if categories["High-Resource"]
            else 0
        )
        mid_avg = (
            round(mean(categories["Mid-Resource"]), 3)
            if categories["Mid-Resource"]
            else 0
        )
        low_avg = (
            round(mean(categories["Low-Resource"]), 3)
            if categories["Low-Resource"]
            else 0
        )

        all_scores = (
            categories["High-Resource"]
            + categories["Mid-Resource"]
            + categories["Low-Resource"]
        )
        overall_avg = round(mean(all_scores), 3)

        model_name = model.split("/")[-1]
        leaderboard_data.append(
            {
                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                "Overall BLEU": overall_avg,
                "High-Resource BLEU": high_avg,
                "Mid-Resource BLEU": mid_avg,
                "Low-Resource BLEU": low_avg,
                "Languages Tested": len(all_scores),
            }
        )
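
    # Sort models by overall BLEU and show medals for the top three ranks.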
    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall BLEU", ascending=False)

    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )

    df = df[
        [
            "Rank",
            "Model",
            "Overall BLEU",
            "High-Resource BLEU",
            "Mid-Resource BLEU",
            "Low-Resource BLEU",
            "Languages Tested",
        ]
    ]

    return gr.DataFrame(
        value=df,
        label="Model Leaderboard",
        show_search="none",
        datatype=[
            "str",
            "markdown",
            "number",
            "number",
            "number",
            "number",
            "number",
        ],
    )


def create_model_comparison_plot(results):
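    """Grouped bar chart of per-language BLEU scores, one color per model."""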
    models = set()
    for lang in results:
        for score in lang["scores"]:
            models.add(score["model"])
    models = list(models)
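
    # One bar trace per model, restricted to the languages it was actually scored on.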
    traces = []
    for model in models:
        x_vals = []
        y_vals = []

        for lang in results:
            model_score = next(
                (s["bleu"] for s in lang["scores"] if s["model"] == model), None
            )
            if model_score is not None:
                x_vals.append(lang["language_name"])
                y_vals.append(model_score)

        traces.append(
            go.Bar(
                name=model.split("/")[-1],
                x=x_vals,
                y=y_vals,
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        title="BLEU Scores by Model and Language",
        xaxis_title="Language",
        yaxis_title="BLEU Score",
        barmode="group",
        height=500,
    )
    return fig


def create_language_stats_df(results):
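    """Per-language summary table: speakers, average BLEU, and the best-scoring model."""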
    flat_data = []

    for lang in results:
        # Guard against score entries without a BLEU value when picking the best model.
        best_score = max(
            lang["scores"] or [{"bleu": None, "model": None}],
            key=lambda x: x["bleu"] or 0,
        )

        model = best_score["model"]
        model_name = model.split("/")[-1] if model else "N/A"
        model_link = (
            f"<a href='https://openrouter.ai/{model}' "
            f"style='text-decoration: none; color: inherit;'>{model_name}</a>"
            if model
            else "N/A"
        )
        row = {
            "Language": f"**{lang['language_name']}**",
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Models Tested": len(lang["scores"]),
            "Average BLEU": round(lang["bleu"], 3)
            if lang["bleu"] is not None
            else "N/A",
            "Best Model": model_link,
            "Best Model BLEU": round(best_score["bleu"], 3)
            if best_score["bleu"] is not None
            else "N/A",
            "CommonVoice Hours": lang["commonvoice_hours"],
        }
        flat_data.append(row)

    df = pd.DataFrame(flat_data)
    return gr.DataFrame(
        value=df,
        label="Language Results",
        show_search="search",
        datatype=[
            "markdown",
            "number",
            "number",
            "number",
            "markdown",
            "number",
            "number",
        ],
    )


def create_scatter_plot(results):
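    """Speaker population (log scale) vs. average BLEU for every language."""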
    fig = go.Figure()

    x_vals = [lang["speakers"] / 1_000_000 for lang in results]
    y_vals = [lang["bleu"] for lang in results]
    labels = [lang["language_name"] for lang in results]

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate="<b>%{text}</b><br>"
            + "Speakers: %{x:.1f}M<br>"
            + "BLEU Score: %{y:.3f}<extra></extra>",
        )
    )

    fig.update_layout(
        title="Language Coverage: Speakers vs BLEU Score",
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title="Average BLEU Score",
        height=500,
        showlegend=False,
    )

    # A log x-axis keeps both very large and very small speaker populations readable.
    fig.update_xaxes(type="log")

    return fig


with gr.Blocks(title="AI Language Translation Benchmark") as demo:
    gr.Markdown("# AI Language Translation Benchmark")
    gr.Markdown(
        "Comparing translation performance across different AI models and languages"
    )
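
    # Figures are built once at startup; the leaderboard, plots, and per-language
    # table are laid out top to bottom.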
    bar_plot = create_model_comparison_plot(results)
    scatter_plot = create_scatter_plot(results)

    create_leaderboard_df(results)
    gr.Plot(value=bar_plot, label="Model Comparison")
    create_language_stats_df(results)
    gr.Plot(value=scatter_plot, label="Language Coverage")

    gr.Markdown(
        """
## Methodology
### Dataset
- Using the [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality, human-translated benchmark covering 200 languages
- Each language is tested with the same 100 sentences
- All translations are from the evaluated language into a fixed set of representative languages, sampled by number of speakers
- Language statistics are sourced from Ethnologue and Wikidata

### Models & Evaluation
- Models are accessed through [OpenRouter](https://openrouter.ai/), including fast models from all major labs, both open and closed
- **BLEU Score**: Translations are evaluated with the BLEU metric, which measures how similar the model's translation is to a human reference translation; higher is better

### Language Categories
Languages are divided into three tiers based on translation difficulty:
- High-Resource: top 25% of languages by BLEU score (easiest to translate)
- Mid-Resource: middle 50% of languages
- Low-Resource: bottom 25% of languages (hardest to translate)
""",
        container=True,
    )


demo.launch()