|
import json |
|
from functools import partial |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
import pycountry |
|
from gradio_rangeslider import RangeSlider |
|
from tqdm import tqdm |
|
|
|
with open("results.json") as f: |
|
languages = json.load(f) |
|
|
|
languages_with_scores = [lang for lang in languages if lang["t2t_score"] is not None] |
|
|
|
|
|
METRICS = { |
|
"t2t": [ |
|
{ |
|
"display_name": "Overall Text-to-Text Performance", |
|
"field_name": "t2t_score", |
|
"label": "Overall Score", |
|
"explanation": """ |
|
**Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks. |
|
Higher scores indicate better overall language capabilities. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Translation (BLEU)", |
|
"field_name": "mt_bleu", |
|
"label": "BLEU Score", |
|
"explanation": """ |
|
**Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations. |
|
It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Translation (ChrF)", |
|
"field_name": "mt_chrf", |
|
"label": "ChrF Score", |
|
"explanation": """ |
|
**Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level. |
|
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches. |
|
Higher scores (0-1) indicate better translations. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Classification (Accuracy)", |
|
"field_name": "cls_acc", |
|
"label": "Classification Accuracy", |
|
"explanation": """ |
|
**Classification Accuracy**: Measures how accurately models can classify text into predefined categories. |
|
This evaluates a model's understanding of content and context across different languages. |
|
Reported as a percentage where higher values indicate better classification performance. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Masked Language Modeling (ChrF)", |
|
"field_name": "mlm_chrf", |
|
"label": "MLM ChrF Score", |
|
"explanation": """ |
|
**Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text. |
|
This tests a model's understanding of language structure and semantics by measuring the character-level similarity |
|
between predicted and actual text. Higher scores indicate better language understanding. |
|
""", |
|
}, |
|
], |
|
"s2t": [ |
|
{ |
|
"display_name": "Overall Speech-to-Text Performance", |
|
"field_name": "s2t_score", |
|
"label": "Overall Score", |
|
"explanation": """ |
|
**Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks. |
|
Higher scores indicate better overall language capabilities. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Automatic Speech Recognition (WER)", |
|
"field_name": "asr_wer", |
|
"label": "WER", |
|
"explanation": """ |
|
**Automatic Speech Recognition Word Error Rate**: Measures the accuracy of speech-to-text transcription. |
|
It calculates the minimum number of word edits (insertions, deletions, substitutions) needed to transform the |
|
transcription into the reference text, divided by the number of words in the reference. |
|
Lower scores indicate better performance, with 0 being perfect transcription. |
|
""", |
|
}, |
|
{ |
|
"display_name": "Automatic Speech Recognition (ChrF)", |
|
"field_name": "asr_chrf", |
|
"label": "ChrF", |
|
"explanation": """ |
|
**Automatic Speech Recognition ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level. |
|
This metric is particularly valuable for morphologically rich languages and can better capture partial word matches. |
|
Higher scores (0-1) indicate better translations. |
|
""", |
|
}, |
|
], |
|
} |
|
|
|
|
|
def mean(lst): |
|
return sum(lst) / len(lst) |
|
|
|
|
|
def create_leaderboard_df(model_type, metric=None): |
|
metric = metric or METRICS[model_type][0] |
|
_model_type = {"t2t": "text-to-text", "s2t": "speech-to-text"}[model_type] |
|
models = { |
|
score["model"] |
|
for lang in languages_with_scores |
|
for score in lang["scores"] |
|
if score["model_type"] == _model_type |
|
} |
|
model_scores = [ |
|
{"model": score["model"], metric["field_name"]: score[metric["field_name"]]} |
|
for lang in languages_with_scores |
|
for score in lang["scores"] |
|
for model in models |
|
if score["model"] == model |
|
] |
|
df = ( |
|
pd.DataFrame(model_scores) |
|
.groupby("model") |
|
.agg({metric["field_name"]: ["mean", "count"]}) |
|
.reset_index() |
|
) |
|
|
|
df.columns = df.columns.map( |
|
lambda x: f"{x[0]}_{x[1]}" if isinstance(x, tuple) else x |
|
) |
|
df = df.rename( |
|
columns={ |
|
f"{metric['field_name']}_mean": metric["label"], |
|
f"{metric['field_name']}_count": "Languages Tested", |
|
"model_": "Model", |
|
} |
|
) |
|
df[metric["label"]] = df[metric["label"]].round(3) |
|
df = df.sort_values(metric["label"], ascending=False) |
|
df["Rank"] = range(1, len(df) + 1) |
|
df["Rank"] = df["Rank"].apply( |
|
lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x) |
|
) |
|
df = df[["Rank", "Model", metric["label"]]] |
|
return gr.DataFrame( |
|
value=df, |
|
label="Model Leaderboard", |
|
show_search=False, |
|
datatype=["number", "markdown", "number"], |
|
) |
|
|
|
|
|
def create_model_comparison_plot(metric): |
|
top_languages = sorted( |
|
languages_with_scores, key=lambda x: x["speakers"], reverse=True |
|
)[:10] |
|
|
|
|
|
title = f"{metric['display_name']} by Model and Language" |
|
y_label = metric["label"] |
|
|
|
|
|
scores_flat = [] |
|
for lang in top_languages: |
|
for score in lang["scores"]: |
|
|
|
if metric["field_name"] not in score: |
|
continue |
|
value = score[metric["field_name"]] |
|
if value is not None: |
|
scores_flat.append( |
|
{ |
|
"language": lang["language_name"], |
|
"model": score["model"], |
|
"value": value, |
|
} |
|
) |
|
|
|
df = pd.DataFrame(scores_flat) |
|
fig = px.bar(df, x="language", y="value", color="model", barmode="group") |
|
fig.update_layout( |
|
title=title, |
|
xaxis_title=None, |
|
yaxis_title=y_label, |
|
barmode="group", |
|
height=500, |
|
legend=dict( |
|
orientation="h", |
|
yanchor="bottom", |
|
y=-0.3, |
|
xanchor="center", |
|
x=0.5, |
|
), |
|
) |
|
return fig |
|
|
|
|
|
def create_language_stats_df(metric): |
|
|
|
flat_data = [] |
|
|
|
for lang in languages: |
|
|
|
best_model = ( |
|
max( |
|
lang["scores"] or [{"t2t_score": None, "model": None}], |
|
key=lambda x: x.get("t2t_score", 0), |
|
) |
|
if lang["t2t_score"] is not None |
|
else None |
|
) |
|
|
|
model = best_model["model"] if best_model else None |
|
model_name = model.split("/")[-1] if model else "N/A" |
|
model_link = ( |
|
f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" |
|
if model |
|
else "N/A" |
|
) |
|
commonvoice_link = ( |
|
f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {round(lang['commonvoice_hours'])}h</a>" |
|
if lang["commonvoice_hours"] |
|
else "N/A" |
|
) |
|
language_link = f"<a href='/{lang['bcp_47']}' style='text-decoration: none; font-weight: bold;'>{lang['language_name']}</a>" |
|
|
|
row = { |
|
"Language": language_link, |
|
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1), |
|
|
|
|
|
|
|
|
|
"Best Model": model_link, |
|
"Translation": round(lang["mt_chrf"], 3) |
|
if lang["mt_chrf"] is not None |
|
else "N/A", |
|
"Classification": round(lang["cls_acc"], 3) |
|
if lang["cls_acc"] is not None |
|
else "N/A", |
|
"Masked Language Modeling": round(lang["mlm_chrf"], 3) |
|
if lang["mlm_chrf"] is not None |
|
else "N/A", |
|
"Speech Recognition": round(lang["asr_chrf"], 3) if lang["asr_wer"] is not None else "N/A", |
|
"CommonVoice": commonvoice_link, |
|
} |
|
flat_data.append(row) |
|
|
|
df = pd.DataFrame(flat_data) |
|
return gr.DataFrame( |
|
value=df, |
|
label="Language Results", |
|
show_search="search", |
|
pinned_columns=1, |
|
column_widths=[ |
|
"100px", |
|
"100px", |
|
|
|
|
|
"200px", |
|
"100px", |
|
"100px", |
|
"100px", |
|
"100px", |
|
"100px", |
|
], |
|
datatype=[ |
|
"markdown", |
|
"number", |
|
|
|
|
|
"markdown", |
|
"number", |
|
"number", |
|
"number", |
|
"number", |
|
"markdown", |
|
], |
|
) |
|
|
|
|
|
def create_scatter_plot(metric): |
|
|
|
scatter_data = [] |
|
for lang in languages_with_scores: |
|
if lang["speakers"] < 100_000: |
|
continue |
|
|
|
scores = [ |
|
score[metric["field_name"]] |
|
for score in lang["scores"] |
|
if metric["field_name"] in score and score[metric["field_name"]] is not None |
|
] |
|
if scores: |
|
avg_score = sum(scores) / len(scores) |
|
scatter_data.append( |
|
{ |
|
"language": lang["language_name"], |
|
"speakers": lang["speakers"], |
|
"score": avg_score, |
|
"family": lang["language_family"], |
|
} |
|
) |
|
|
|
fig = go.Figure() |
|
x_vals = [data["speakers"] / 1_000_000 for data in scatter_data] |
|
y_vals = [data["score"] for data in scatter_data] |
|
s_vals = [data["speakers"] / 20_000_000 for data in scatter_data] |
|
color_pallette = [ |
|
"LightSkyBlue", |
|
"LightGreen", |
|
"LightCoral", |
|
"LightPink", |
|
"LightGoldenRodYellow", |
|
"LightGray", |
|
"LightSalmon", |
|
"LightSeaGreen", |
|
] |
|
color_mapping = { |
|
family: color |
|
for family, color in zip( |
|
sorted(set(data["family"] for data in scatter_data)), color_pallette |
|
) |
|
} |
|
c_vals = [color_mapping.get(data["family"], "LightGray") for data in scatter_data] |
|
labels = [data["language"] for data in scatter_data] |
|
hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>" |
|
fig.add_trace( |
|
go.Scatter( |
|
x=x_vals, |
|
y=y_vals, |
|
marker=dict(size=s_vals, color=c_vals), |
|
mode="markers+text", |
|
text=labels, |
|
textposition="top center", |
|
hovertemplate=hover_template, |
|
) |
|
) |
|
fig.update_layout( |
|
title=None, |
|
xaxis_title="Number of Speakers (Millions)", |
|
yaxis_title=metric["label"], |
|
height=500, |
|
showlegend=False, |
|
) |
|
fig.update_xaxes(type="log") |
|
return fig |
|
|
|
|
|
def format_number(n): |
|
"""Format number with K/M suffix""" |
|
if n >= 1_000_000: |
|
return f"{n/1_000_000:.1f}M" |
|
elif n >= 1_000: |
|
return f"{n/1_000:.0f}K" |
|
return str(n) |
|
|
|
|
|
def get_population_data(): |
|
import xml.etree.ElementTree as ET |
|
|
|
from language_data.util import data_filename |
|
|
|
filename = data_filename("supplementalData.xml") |
|
root = ET.fromstring(open(filename).read()) |
|
territories = root.findall("./territoryInfo/territory") |
|
|
|
data = {} |
|
for territory in territories: |
|
t_code = territory.attrib["type"] |
|
t_population = float(territory.attrib["population"]) |
|
data[t_code] = t_population |
|
return data |
|
|
|
|
|
|
|
def make_black_bar(value, max_width=10): |
|
filled = int(value * max_width) |
|
return "⬛️" * filled + "⬜️" * (max_width - filled) |
|
|
|
|
|
def make_colored_bar(score, max_width=10): |
|
"""Create a colored bar using Unicode blocks based on normalized score |
|
🟦 for high values (>0.35) |
|
🟨 for medium values (0.25-0.35) |
|
🟥 for low values (<0.25) |
|
⬜ for empty space |
|
|
|
This function handles both normalization and bar creation. |
|
""" |
|
|
|
|
|
filled = int(score * max_width) |
|
filled = max(0, min(filled, max_width)) |
|
empty = max_width - filled |
|
|
|
if score > 0.35: |
|
return "🟦" * filled + "⬜" * empty |
|
elif score > 0.25: |
|
return "🟨" * filled + "⬜" * empty |
|
else: |
|
return "🟥" * filled + "⬜" * empty |
|
|
|
|
|
def create_world_map(metric): |
|
|
|
population_data = get_population_data() |
|
country_data = {} |
|
for lang in languages: |
|
|
|
if "population" not in lang or lang[metric["field_name"]] is None: |
|
continue |
|
|
|
for country_code, speakers in lang["population"].items(): |
|
try: |
|
|
|
country = pycountry.countries.get(alpha_2=country_code) |
|
if country is None: |
|
continue |
|
|
|
iso3_code = country.alpha_3 |
|
if iso3_code not in country_data: |
|
country_data[iso3_code] = { |
|
"total_speakers": 0, |
|
"population": population_data.get(country_code, 0), |
|
"weighted_score_sum": 0, |
|
"languages": [], |
|
} |
|
|
|
country_data[iso3_code]["total_speakers"] += speakers |
|
country_data[iso3_code]["weighted_score_sum"] += ( |
|
speakers * lang[metric["field_name"]] |
|
) |
|
country_data[iso3_code]["languages"].append( |
|
{ |
|
"name": lang["language_name"], |
|
"speakers": speakers, |
|
"score": lang[metric["field_name"]], |
|
} |
|
) |
|
except (KeyError, AttributeError): |
|
|
|
continue |
|
|
|
|
|
countries = [] |
|
scores = [] |
|
hover_texts = [] |
|
|
|
for country_code, data in country_data.items(): |
|
weighted_avg = data["weighted_score_sum"] / data["total_speakers"] if data["total_speakers"] > 0 else None |
|
|
|
try: |
|
country_name = pycountry.countries.get(alpha_3=country_code).name |
|
except AttributeError: |
|
country_name = country_code |
|
|
|
|
|
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True) |
|
|
|
|
|
main_langs = langs[:5] |
|
other_langs = langs[5:] |
|
|
|
|
|
lang_rows = [] |
|
for lang in main_langs: |
|
percentage = (lang["speakers"] / data["population"]) * 100 |
|
speaker_bar = make_black_bar(percentage / 100) |
|
|
|
|
|
score_bar = make_colored_bar(lang["score"]) |
|
|
|
lang_rows.append( |
|
f"<b>{lang['name']}</b><br>" |
|
f"{speaker_bar} {format_number(lang['speakers'])} speakers<br>" |
|
f"{score_bar} {lang['score']:.3f} {metric['label']}<br>" |
|
) |
|
|
|
|
|
if other_langs: |
|
other_speakers = sum(lang["speakers"] for lang in other_langs) |
|
other_percentage = (other_speakers / data["population"]) * 100 |
|
other_avg_score = sum(lang["score"] for lang in other_langs) / len( |
|
other_langs |
|
) |
|
|
|
speaker_bar = make_black_bar(other_percentage / 100) |
|
|
|
|
|
score_bar = make_colored_bar(other_avg_score) |
|
|
|
lang_rows.append( |
|
f"<b>+{len(other_langs)} other languages</b><br>" |
|
f"{speaker_bar} {format_number(other_speakers)} speakers<br>" |
|
f"{score_bar} {other_avg_score:.3f} {metric['label']}<br>" |
|
) |
|
|
|
hover_text = f"<b>{country_name}</b><br><br>" f"{'<br>'.join(lang_rows)}" |
|
|
|
countries.append(country_code) |
|
scores.append(weighted_avg) |
|
hover_texts.append(hover_text) |
|
|
|
fig = go.Figure( |
|
data=go.Choropleth( |
|
locations=countries, |
|
locationmode="ISO-3", |
|
z=scores, |
|
text=hover_texts, |
|
hoverinfo="text", |
|
colorscale=[[0, "#ff9999"], [1, "#99ccff"]], |
|
colorbar=dict( |
|
title=metric["label"], |
|
orientation="h", |
|
y=-0.2, |
|
yanchor="bottom", |
|
len=0.5, |
|
x=0.5, |
|
xanchor="center", |
|
thickness=20, |
|
), |
|
) |
|
) |
|
|
|
fig.update_layout( |
|
title=dict( |
|
text=f"{metric['display_name']} by Country", x=0.5, xanchor="center" |
|
), |
|
geo=dict( |
|
showframe=True, |
|
showcoastlines=True, |
|
projection_type="equal earth", |
|
showland=True, |
|
landcolor="#f8f9fa", |
|
coastlinecolor="#e0e0e0", |
|
countrycolor="#e0e0e0", |
|
), |
|
height=600, |
|
margin=dict(l=0, r=0, t=30, b=0), |
|
paper_bgcolor="white", |
|
hoverlabel=dict( |
|
bgcolor="beige", |
|
font_size=12, |
|
), |
|
) |
|
|
|
return fig |
|
|
|
|
|
def create_metric_selector(model_type): |
|
match model_type: |
|
case "t2t": |
|
choices = [m["display_name"] for m in METRICS["t2t"]] |
|
case "s2t": |
|
choices = [m["display_name"] for m in METRICS["s2t"]] |
|
return gr.Dropdown( |
|
choices=choices, value=choices[0], label="Select Metric", interactive=True |
|
) |
|
|
|
|
|
def create_metric_explanation(metric): |
|
return gr.Markdown(metric["explanation"], container=True) |
|
|
|
css=""" |
|
.radio-group .wrap { |
|
display: grid !important; |
|
grid-template-columns: 1fr 1fr; |
|
} |
|
|
|
.nav-holder {display: none;} |
|
|
|
.share-link { |
|
display: inline-flex; |
|
align-items: center; |
|
background-color: #f0f0f0; |
|
border-radius: 8px; |
|
padding: 8px 12px; |
|
margin: 10px 0; |
|
font-family: monospace; |
|
transition: all 0.2s ease; |
|
cursor: pointer; |
|
text-decoration: none; |
|
color: #333; |
|
} |
|
|
|
.share-link:hover { |
|
background-color: #e0e0e0; |
|
} |
|
|
|
.share-link .icon { |
|
margin-left: 8px; |
|
} |
|
|
|
.title-row { |
|
display: flex; |
|
align-items: center; |
|
justify-content: space-between; |
|
margin-bottom: 1rem; |
|
} |
|
|
|
.title-row h2 { |
|
margin: 0; |
|
} |
|
""" |
|
|
|
|
|
shortcut_js = """ |
|
<script> |
|
// Handle URL parameters for direct language access |
|
const params = new URLSearchParams(window.location.search); |
|
const lang = params.get("lang"); |
|
|
|
if (lang) { |
|
window.location.href = "/" + lang; |
|
} |
|
|
|
// Function to copy link to clipboard |
|
const copyLinkToClipboard = (link) => { |
|
navigator.clipboard.writeText(link); |
|
console.log("Copied link to clipboard: " + link); |
|
} |
|
|
|
const redirect_to_lang = lang_descriptor => { |
|
lang_code = lang_descriptor.split("(")[1].split(")")[0]; |
|
console.log("redirecting to /" + lang_code); |
|
window.location.href = "/" + lang_code; |
|
} |
|
|
|
const empty_search = () => { |
|
console.log("empty search"); |
|
document.getElementById("search-dropdown").value = ""; |
|
} |
|
</script> |
|
""" |
|
|
|
|
|
|
|
with gr.Blocks(title="AI Language Proficiency Benchmark", css=css, head=shortcut_js) as demo: |
|
language_choices = [ |
|
f"{lang['language_name']} ({lang['bcp_47']})" for lang in languages |
|
] |
|
models = {score["model"] for lang in languages for score in lang["scores"]} |
|
search = gr.Dropdown( |
|
choices=language_choices, |
|
value="Search for Language or Model", |
|
allow_custom_value=True, |
|
interactive=True, |
|
container=False, |
|
elem_id="search-dropdown" |
|
) |
|
search.focus(fn=lambda x: None, inputs=search, outputs=None, js="(x) => {empty_search()}") |
|
search.change(fn=lambda x: None, inputs=search, outputs=None, js="(x) => {redirect_to_lang(x)}") |
|
gr.Markdown("# AI Language Proficiency Benchmark") |
|
gr.Markdown("Comparing language proficiency across different models and languages.") |
|
with gr.Row(): |
|
start_model_type = "Text-to-Text" |
|
model_type = gr.Radio( |
|
choices=["Text-to-Text", "Speech-to-Text"], |
|
value=start_model_type, |
|
label="Select Model Type", |
|
interactive=True, |
|
elem_classes="radio-group", |
|
) |
|
start_metric = METRICS["t2t"][0] |
|
metric = gr.Dropdown( |
|
choices=[metric["display_name"] for metric in METRICS["t2t"]], |
|
value=start_metric["display_name"], |
|
label="Main task and metric to display in figures and map", |
|
interactive=True, |
|
) |
|
with gr.Row(): |
|
with gr.Column(): |
|
with gr.Accordion("Model Filters", open=False): |
|
model_licenses = gr.CheckboxGroup( |
|
choices=["open source", "commercial"], |
|
value=["open source", "commercial"], |
|
label="Filter by Model License", |
|
interactive=True, |
|
) |
|
model_sizes = RangeSlider( |
|
minimum=0, |
|
maximum=1000, |
|
value=(0, 1000), |
|
label="Filter by Model Size (in Billion Parameters)", |
|
interactive=True, |
|
) |
|
with gr.Column(): |
|
with gr.Accordion("Language Filters", open=False): |
|
unit_of_analysis = gr.Radio( |
|
choices=["Languages", "Language Families", "Regions"], |
|
value="Languages", |
|
label="Select Unit of Analysis", |
|
interactive=True, |
|
) |
|
family_filter = gr.CheckboxGroup( |
|
choices=[ |
|
"Indo-European", |
|
"Sino-Tibetan", |
|
"Afro-Asiatic", |
|
"Dravidian", |
|
"Uralic", |
|
"Austronesian", |
|
"Other", |
|
], |
|
value=[ |
|
"Indo-European", |
|
"Sino-Tibetan", |
|
"Afro-Asiatic", |
|
"Dravidian", |
|
"Uralic", |
|
"Austronesian", |
|
"Other", |
|
], |
|
label="Filter by Language Family", |
|
interactive=True, |
|
) |
|
speakers_filter = RangeSlider( |
|
minimum=0, |
|
maximum=100_000_000, |
|
value=(0, 100_000_000), |
|
label="Filter by Number of Speakers", |
|
interactive=True, |
|
) |
|
|
|
gr.Markdown("## Model Comparison") |
|
leaderboard_df = create_leaderboard_df("t2t", start_metric) |
|
|
|
model_comparison_plot = gr.Plot( |
|
value=create_model_comparison_plot(start_metric), |
|
label="Model Comparison", |
|
) |
|
|
|
gr.Markdown("## Language Stats") |
|
create_language_stats_df(start_metric) |
|
scatter_plot = gr.Plot( |
|
value=create_scatter_plot(start_metric), |
|
label="Speaker Population vs. Metric", |
|
) |
|
world_map = gr.Plot( |
|
value=create_world_map(start_metric), |
|
label="World Map", |
|
container=False, |
|
elem_classes="fullwidth-plot", |
|
) |
|
|
|
def update_model_type(model_type_choice): |
|
model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice] |
|
return create_metric_selector(model_type), create_leaderboard_df(model_type) |
|
|
|
model_type.change( |
|
fn=update_model_type, |
|
inputs=model_type, |
|
outputs=[metric, leaderboard_df], |
|
) |
|
|
|
def update_component(fn, model_type_choice, metric_choice): |
|
model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice] |
|
metric = [m for m in METRICS[model_type] if m["display_name"] == metric_choice][ |
|
0 |
|
] |
|
return fn(metric) |
|
metric.change( |
|
fn=partial(update_component, create_model_comparison_plot), |
|
inputs=[model_type, metric], |
|
outputs=model_comparison_plot, |
|
) |
|
metric.change( |
|
fn=partial(update_component, create_scatter_plot), |
|
inputs=[model_type, metric], |
|
outputs=scatter_plot, |
|
) |
|
metric.change( |
|
fn=partial(update_component, create_world_map), |
|
inputs=[model_type, metric], |
|
outputs=world_map, |
|
) |
|
|
|
with gr.Accordion("Methodology", open=False): |
|
gr.Markdown( |
|
""" |
|
### Benchmark Data |
|
We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one. |
|
|
|
Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package. |
|
|
|
### AI Models |
|
We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API. |
|
|
|
### Evaluation Tasks |
|
Our benchmark includes three core tasks to assess different aspects of language understanding: |
|
|
|
1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using: |
|
- [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty |
|
- [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations |
|
|
|
2. **Text Classification**: Models classify text into predefined topics after being shown examples. We: |
|
- Group sentences by URL into paragraphs with the same topic |
|
- Use the 5 most common topics, encoded as numbers rather than English labels |
|
- Provide 5 examples of each topic as few-shot examples |
|
- Test the model's ability to classify new text |
|
- Report accuracy as the primary metric |
|
|
|
3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We: |
|
- Mask approximately 5% of each sentence at a random position |
|
- Provide 10 examples of complete sentences paired with masked versions in a few-shot setting |
|
- Evaluate predictions using ChrF score against the original text |
|
|
|
The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages. |
|
""" |
|
) |
|
|
|
|
|
for lang in tqdm(languages[:20], desc="Generating pages"): |
|
with demo.route(lang['language_name'], f"/{lang['bcp_47']}"): |
|
gr.Button("← Back to Main Dashboard", link="/") |
|
url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}" |
|
gr.Markdown( |
|
f''' |
|
<div class="title-row"> |
|
<h2>{lang['language_name']}</h2> |
|
<div class="share-link" onclick="copyLinkToClipboard('{url}')">{url}<span class="icon">📋</span></div> |
|
</div> |
|
''', |
|
sanitize_html=False |
|
) |
|
|
|
|
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
gr.Markdown(f""" |
|
## Language Overview |
|
- **Native name**: {lang.get('native_name', 'N/A')} |
|
- **Language family**: {lang.get('language_family', 'N/A')} |
|
- **BCP-47 code**: `{lang['bcp_47']}` |
|
- **ISO 639-3 code**: `{lang.get('iso_639_3', 'N/A')}` |
|
- **Number of speakers**: {format_number(lang['speakers'])} |
|
- **Script**: {lang.get('script', 'N/A')} |
|
- **CommonVoice hours**: {round(lang.get('commonvoice_hours', 0) or 0)} |
|
""") |
|
|
|
|
|
resource_links = [] |
|
if lang.get('commonvoice_locale'): |
|
resource_links.append(f"[CommonVoice Dataset](https://commonvoice.mozilla.org/{lang['commonvoice_locale']})") |
|
if lang.get('wikipedia_code'): |
|
resource_links.append(f"[Wikipedia](https://{lang['wikipedia_code']}.wikipedia.org)") |
|
if lang.get('bcp_47'): |
|
resource_links.append(f"[FLORES+ Dataset](https://huggingface.co/datasets/openlanguagedata/flores_plus/viewer/all/{lang['bcp_47']})") |
|
|
|
if resource_links: |
|
gr.Markdown("### Resources\n" + "\n".join(resource_links)) |
|
|
|
with gr.Column(scale=3): |
|
|
|
country_data = {} |
|
if "population" in lang: |
|
for country_code, speakers in lang["population"].items(): |
|
try: |
|
country = pycountry.countries.get(alpha_2=country_code) |
|
if country: |
|
country_data[country.alpha_3] = speakers / lang["speakers"] |
|
except (KeyError, AttributeError): |
|
continue |
|
|
|
locations = list(country_data.keys()) |
|
values = list(country_data.values()) |
|
|
|
if locations: |
|
fig = go.Figure(data=go.Choropleth( |
|
locations=locations, |
|
z=values, |
|
locationmode="ISO-3", |
|
colorscale="Blues", |
|
marker_line_color='white', |
|
marker_line_width=0.5, |
|
colorbar_title="Speaker %" |
|
)) |
|
|
|
fig.update_layout( |
|
title_text=f"Distribution of {lang['language_name']} Speakers", |
|
geo=dict( |
|
showframe=False, |
|
showcoastlines=True, |
|
projection_type='natural earth' |
|
), |
|
height=300, |
|
margin={"r":0,"t":30,"l":0,"b":0} |
|
) |
|
|
|
gr.Plot(value=fig) |
|
else: |
|
gr.Markdown("*Geographic data not available*") |
|
|
|
|
|
gr.Markdown("## AI Model Performance") |
|
|
|
with gr.Row(): |
|
with gr.Column(): |
|
|
|
metrics_data = [] |
|
for metric_key, display_name in [ |
|
("t2t_score", "Overall Text Performance"), |
|
("mt_bleu", "Translation (BLEU)"), |
|
("mt_chrf", "Translation (ChrF)"), |
|
("cls_acc", "Classification"), |
|
("mlm_chrf", "Masked Language Modeling"), |
|
("s2t_score", "Overall Speech Performance"), |
|
("asr_wer", "Speech Recognition (WER)"), |
|
("asr_chrf", "Speech Recognition (ChrF)") |
|
]: |
|
if metric_key in lang and lang[metric_key] is not None: |
|
value = lang[metric_key] |
|
color = "green" if value > 0.5 else "orange" if value > 0.25 else "red" |
|
|
|
|
|
if metric_key == "asr_wer": |
|
color = "green" if value < 0.3 else "orange" if value < 0.6 else "red" |
|
|
|
metrics_data.append({ |
|
"Metric": display_name, |
|
"Value": round(value, 3), |
|
"Visual": make_colored_bar(value if metric_key != "asr_wer" else 1 - value) |
|
}) |
|
|
|
if metrics_data: |
|
gr.DataFrame( |
|
pd.DataFrame(metrics_data), |
|
label=f"Performance Metrics for {lang['language_name']}", |
|
show_search=False |
|
) |
|
else: |
|
gr.Markdown("*No performance metrics available*") |
|
|
|
|
|
gr.Markdown("## Model Comparison") |
|
|
|
with gr.Row(): |
|
models_data = [] |
|
for score in lang["scores"]: |
|
if score.get("t2t_score") is not None: |
|
model_name = score["model"].split("/")[-1] |
|
models_data.append({ |
|
"Model": model_name, |
|
"Overall": round(score.get("t2t_score", 0), 3), |
|
"Translation": round(score.get("mt_chrf", 0), 3), |
|
"Classification": round(score.get("cls_acc", 0), 3), |
|
"Lang Model": round(score.get("mlm_chrf", 0), 3), |
|
"Speech": round(score.get("asr_chrf", 0), 3) if "asr_chrf" in score else "N/A" |
|
}) |
|
|
|
if models_data: |
|
df = pd.DataFrame(models_data).sort_values("Overall", ascending=False) |
|
gr.DataFrame( |
|
df, |
|
label=f"Model Performance on {lang['language_name']}", |
|
show_search=False |
|
) |
|
else: |
|
gr.Markdown("*No model comparison data available*") |
|
|
|
|
|
if lang.get("language_family"): |
|
gr.Markdown("## Comparison with Related Languages") |
|
|
|
|
|
related_langs = [l for l in languages if l.get("language_family") == lang["language_family"] and l["t2t_score"] is not None] |
|
related_langs = sorted(related_langs, key=lambda x: x["t2t_score"], reverse=True)[:10] |
|
|
|
if len(related_langs) > 1: |
|
lang_names = [l["language_name"] for l in related_langs] |
|
t2t_scores = [l["t2t_score"] for l in related_langs] |
|
|
|
fig = px.bar( |
|
x=lang_names, |
|
y=t2t_scores, |
|
labels={"x": "Language", "y": "Text-to-Text Score"}, |
|
title=f"Performance Across {lang['language_family']} Languages" |
|
) |
|
|
|
|
|
for i, name in enumerate(lang_names): |
|
if name == lang["language_name"]: |
|
fig.data[0].marker.color = ["lightblue"] * i + ["orange"] + ["lightblue"] * (len(lang_names) - i - 1) |
|
|
|
fig.update_layout(height=400) |
|
gr.Plot(value=fig) |
|
|
|
|
|
demo.launch() |
|
|