Spaces:

fair-forward
/

evals-for-every-language

Running

David Pomerenke

Metrics selector & refactoring

4f572a5 2 months ago

21.6 kB

	import json

	import gradio as gr
	import pandas as pd
	import plotly.express as px
	import plotly.graph_objects as go
	import pycountry

	# Load the precomputed evaluation results once at import time; the functions
	# in this file read this module-level `results` object.
	# The encoding is explicit so that non-ASCII text in the JSON is decoded the
	# same way regardless of the platform's default locale.
	with open("results.json", encoding="utf-8") as f:
	    results = json.load(f)

	# Global constants for metric mappings
	#
	# Each entry describes one selectable metric:
	#   display_name - text shown in the metric dropdown and used in plot titles
	#   field_name   - key under which the value is stored in `results` entries
	#                  and in their per-model score dicts
	#   label        - axis / colorbar / hover label used by the plots
	#   explanation  - Markdown text rendered by create_metric_explanation()
	METRICS = {
	# Combined score across all tasks (the default selection in the UI).
	"overall_performance": {
	"display_name": "Overall Performance",
	"field_name": "overall_score",
	"label": "Overall Performance Score",
	"explanation": """
	Overall Performance: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
	Higher scores indicate better overall language capabilities.
	""",
	},
	# Machine translation scored with BLEU.
	"translation_bleu": {
	"display_name": "Translation (BLEU)",
	"field_name": "mt_bleu",
	"label": "BLEU Score",
	"explanation": """
	Translation BLEU: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
	It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
	""",
	},
	# Machine translation scored with ChrF.
	"translation_chrf": {
	"display_name": "Translation (ChrF)",
	"field_name": "mt_chrf",
	"label": "ChrF Score",
	"explanation": """
	Translation ChrF: Character n-gram F-score evaluates translations at the character level rather than word level.
	This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
	Higher scores (0-1) indicate better translations.
	""",
	},
	# Text classification scored by accuracy.
	"classification_accuracy": {
	"display_name": "Classification (Accuracy)",
	"field_name": "cls_acc",
	"label": "Classification Accuracy",
	"explanation": """
	Classification Accuracy: Measures how accurately models can classify text into predefined categories.
	This evaluates a model's understanding of content and context across different languages.
	Reported as a percentage where higher values indicate better classification performance.
	""",
	},
	# Masked language modeling scored with ChrF.
	"mlm_chrf": {
	"display_name": "Masked Language Modeling (ChrF)",
	"field_name": "mlm_chrf",
	"label": "MLM ChrF Score",
	"explanation": """
	Masked Language Modeling ChrF: Evaluates how well models can predict masked (hidden) portions of text.
	This tests a model's understanding of language structure and semantics by measuring the character-level similarity
	between predicted and actual text. Higher scores indicate better language understanding.
	""",
	},
	}


	def mean(lst):
	    """Return the arithmetic mean of the numbers in `lst`.

	    Args:
	        lst: Any iterable of numbers (list, tuple, generator, ...).

	    Returns:
	        The sum of the values divided by their count.

	    Raises:
	        ZeroDivisionError: If `lst` is empty; callers are expected to guard
	            against empty input before calling.
	    """
	    # Materialise once so that generators (which have no len()) work too.
	    values = list(lst)
	    return sum(values) / len(values)


	def create_leaderboard_df(metric):
	    """Build the model leaderboard table for one metric.

	    Languages are ranked by their language-level value of the metric; the top
	    quarter is labelled "High-Resource", the bottom quarter "Low-Resource" and
	    everything else (including languages that have no language-level value
	    for the metric) "Mid-Resource". Each model's per-language scores are then
	    averaged per tier and overall.

	    Args:
	        metric: One entry of `METRICS`; only its "field_name" is used.

	    Returns:
	        A `gr.DataFrame` with one row per model, sorted by overall score, with
	        medal emojis for the top three ranks.
	    """
	    field = metric["field_name"]
	    tiers = ("High-Resource", "Mid-Resource", "Low-Resource")

	    # Rank languages by the selected metric to determine the resource tiers.
	    scored_langs = [lang for lang in results if lang[field] is not None]
	    ranked_langs = sorted(scored_langs, key=lambda lang: lang[field], reverse=True)
	    n_langs = len(ranked_langs)
	    quarter = n_langs // 4
	    high_resource = {lang["language_name"] for lang in ranked_langs[:quarter]}
	    # Slice from an explicit start index: `[-quarter:]` would select every
	    # language when quarter == 0.
	    low_resource = {lang["language_name"] for lang in ranked_langs[n_langs - quarter:]}

	    # Collect every model's per-language scores, grouped by tier.
	    model_scores = {}
	    for lang in results:
	        name = lang["language_name"]
	        if name in high_resource:
	            tier = "High-Resource"
	        elif name in low_resource:
	            tier = "Low-Resource"
	        else:
	            tier = "Mid-Resource"

	        for score in lang["scores"]:
	            value = score[field]
	            if value is None:
	                # A missing score cannot be averaged; including it would make
	                # sum() raise TypeError further down.
	                continue
	            model = score["model"]
	            if model not in model_scores:
	                model_scores[model] = {t: [] for t in tiers}
	            model_scores[model][tier].append(value)

	    def average(values):
	        """Mean rounded to 3 decimals, or 0 when there is nothing to average."""
	        return round(sum(values) / len(values), 3) if values else 0

	    leaderboard_data = []
	    for model, by_tier in model_scores.items():
	        all_scores = (
	            by_tier["High-Resource"]
	            + by_tier["Mid-Resource"]
	            + by_tier["Low-Resource"]
	        )
	        model_name = model.split("/")[-1]
	        leaderboard_data.append(
	            {
	                "Model": f"[{model_name}](https://openrouter.ai/{model})",
	                "Overall Score": average(all_scores),
	                "High-Resource Score": average(by_tier["High-Resource"]),
	                "Mid-Resource Score": average(by_tier["Mid-Resource"]),
	                "Low-Resource Score": average(by_tier["Low-Resource"]),
	                "Languages Tested": len(all_scores),
	            }
	        )

	    columns = [
	        "Rank",
	        "Model",
	        "Overall Score",
	        "High-Resource Score",
	        "Mid-Resource Score",
	        "Low-Resource Score",
	        "Languages Tested",
	    ]
	    # Passing the column names explicitly keeps sort_values() working even
	    # when no model has any score for this metric (empty table).
	    df = pd.DataFrame(leaderboard_data, columns=columns[1:])
	    df = df.sort_values("Overall Score", ascending=False)

	    # Add rank and medals (assigned positionally, i.e. in sorted order).
	    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
	    df["Rank"] = [medals.get(pos, str(pos)) for pos in range(1, len(df) + 1)]
	    df = df[columns]

	    return gr.DataFrame(
	        value=df,
	        label="Model Leaderboard",
	        show_search=False,
	        datatype=[
	            "number",
	            "markdown",
	            "number",
	            "number",
	            "number",
	            "number",
	            "number",
	        ],
	    )


	def create_model_comparison_plot(metric):
	    """Create a grouped bar chart of per-model scores for the ten most spoken languages.

	    Args:
	        metric: One entry of `METRICS`; "field_name" selects the score,
	            "display_name" and "label" are used for the title and y-axis.

	    Returns:
	        A Plotly figure with one bar group per language and one bar per model.
	    """
	    field = metric["field_name"]
	    top_languages = sorted(results, key=lambda lang: lang["speakers"], reverse=True)[:10]

	    # Flatten the nested results into one row per (language, model) pair,
	    # skipping models that have no value for the selected metric.
	    scores_flat = []
	    for lang in top_languages:
	        for score in lang["scores"]:
	            value = score[field]
	            if value is not None:
	                scores_flat.append(
	                    {
	                        "language": lang["language_name"],
	                        "model": score["model"],
	                        "value": value,
	                    }
	                )

	    # Explicit columns keep px.bar() working when there are no rows at all;
	    # a column-less DataFrame would make it fail on the unknown "language" column.
	    df = pd.DataFrame(scores_flat, columns=["language", "model", "value"])
	    fig = px.bar(df, x="language", y="value", color="model", barmode="group")
	    fig.update_layout(
	        title=f"{metric['display_name']} by Model and Language",
	        xaxis_title=None,
	        yaxis_title=metric["label"],
	        height=500,
	        legend=dict(
	            orientation="h",  # horizontal orientation
	            yanchor="bottom",
	            y=-0.3,  # position below plot
	            xanchor="center",
	            x=0.5,  # center horizontally
	        ),
	    )
	    return fig


	def create_language_stats_df(metric):
	    """Build the per-language results table.

	    The table always shows the same fixed set of score columns and picks the
	    best model by "overall_score"; the `metric` argument is accepted for
	    signature parity with the other `create_*` functions but is not used.

	    Args:
	        metric: One entry of `METRICS` (currently unused).

	    Returns:
	        A searchable `gr.DataFrame` with one row per language.
	    """

	    def rounded_or_na(value):
	        """Round to 3 decimals, or return the "N/A" placeholder for missing values."""
	        return round(value, 3) if value is not None else "N/A"

	    flat_data = []
	    for lang in results:
	        # Pick the model with the highest overall score. Entries without an
	        # overall score are ignored: comparing None with a number inside
	        # max() would raise TypeError.
	        rated = [s for s in lang["scores"] if s["overall_score"] is not None]
	        model = max(rated, key=lambda s: s["overall_score"])["model"] if rated else None

	        if model:
	            model_name = model.split("/")[-1]
	            model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
	        else:
	            model_link = "N/A"

	        if lang["commonvoice_hours"]:
	            # The zero-padded HTML comment in front makes the column sort
	            # numerically even though the cell is rendered as markdown.
	            commonvoice_link = f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
	        else:
	            commonvoice_link = "N/A"

	        flat_data.append(
	            {
	                "Language": f"{lang['language_name']}",
	                "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
	                "Overall": rounded_or_na(lang["overall_score"]),
	                "Trans-lation": rounded_or_na(lang["mt_bleu"]),
	                "Classi-fication": rounded_or_na(lang["cls_acc"]),
	                "MLM": rounded_or_na(lang["mlm_chrf"]),
	                "Best Model": model_link,
	                "CommonVoice Hours": commonvoice_link,
	            }
	        )

	    df = pd.DataFrame(flat_data)
	    return gr.DataFrame(
	        value=df,
	        label="Language Results",
	        show_search="search",
	        datatype=[
	            "markdown",  # Language
	            "number",  # Speakers
	            "number",  # Overall
	            "number",  # Translation
	            "number",  # Classification
	            "number",  # MLM
	            "markdown",  # Best Model
	            "markdown",  # CommonVoice Hours
	        ],
	    )


	def create_scatter_plot(metric):
	    """Plot each language's speaker count against its mean model score.

	    Only languages with at least 10,000 speakers and at least one non-missing
	    model score for the metric are shown. Speaker counts are displayed in
	    millions on a logarithmic x-axis.

	    Args:
	        metric: One entry of `METRICS`; "field_name" selects the score and
	            "label" names the y-axis and hover entry.

	    Returns:
	        A Plotly figure containing a single labelled scatter trace.
	    """
	    field = metric["field_name"]
	    label = metric["label"]

	    speakers_millions = []
	    mean_scores = []
	    names = []
	    for lang in results:
	        if lang["speakers"] < 10_000:
	            continue
	        valid = [s[field] for s in lang["scores"] if s[field] is not None]
	        if not valid:
	            continue
	        names.append(lang["language_name"])
	        speakers_millions.append(lang["speakers"] / 1_000_000)
	        mean_scores.append(sum(valid) / len(valid))

	    trace = go.Scatter(
	        x=speakers_millions,
	        y=mean_scores,
	        mode="markers+text",
	        text=names,
	        textposition="top center",
	        hovertemplate=f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{label}: %{{y:.3f}}<extra></extra>",
	    )

	    fig = go.Figure()
	    fig.add_trace(trace)
	    fig.update_layout(
	        title=None,
	        xaxis_title="Number of Speakers (Millions)",
	        yaxis_title=label,
	        height=500,
	        showlegend=False,
	    )
	    # Speaker numbers span several orders of magnitude, hence the log axis.
	    fig.update_xaxes(type="log")
	    return fig


	def format_number(n):
	    """Format a number compactly with a K (thousands) or M (millions) suffix.

	    Args:
	        n: A non-negative number.

	    Returns:
	        "<x.y>M" for values of one million and above, "<x>K" for values of one
	        thousand and above, and `str(n)` otherwise.
	    """
	    if n >= 1_000_000:
	        return f"{n / 1_000_000:.1f}M"
	    if n >= 1_000:
	        thousands = f"{n / 1_000:.0f}"
	        # Values just below one million round up to "1000"; show them as
	        # "1.0M" rather than the inconsistent "1000K".
	        if thousands == "1000":
	            return f"{n / 1_000_000:.1f}M"
	        return f"{thousands}K"
	    return str(n)


	def get_population_data():
	    """Read territory populations from the `language_data` package's supplementalData.xml.

	    Returns:
	        A dict mapping each territory's "type" attribute (used by the caller
	        as a two-letter country code) to its "population" attribute as a float.
	    """
	    import xml.etree.ElementTree as ET

	    from language_data.util import data_filename

	    filename = data_filename("supplementalData.xml")
	    # ET.parse() opens and closes the file itself and decodes it according to
	    # the XML declaration, instead of leaking a handle opened with the
	    # platform's default text encoding.
	    root = ET.parse(filename).getroot()

	    return {
	        territory.attrib["type"]: float(territory.attrib["population"])
	        for territory in root.findall("./territoryInfo/territory")
	    }

	# Helper functions for visualization
	def make_black_bar(value, max_width=10):
	    """Render a fraction as a fixed-width bar of black and white squares.

	    Args:
	        value: Fraction to display; values outside [0, 1] are clamped so the
	            bar is always exactly `max_width` squares wide.
	        max_width: Total number of squares in the bar.

	    Returns:
	        A string of `max_width` squares, the first `int(value * max_width)`
	        (after clamping) being black.
	    """
	    filled = int(value * max_width)
	    # Without clamping, a fraction above 1 overflows the bar and a negative
	    # one yields a bar that is shorter than max_width.
	    filled = max(0, min(filled, max_width))
	    return "⬛️" * filled + "⬜️" * (max_width - filled)


	def make_colored_bar(score, max_width=10):
	    """Render a score as a fixed-width bar whose colour reflects the score level.

	    The filled part is blue (🟦) for scores above 0.35, yellow (🟨) for scores
	    above 0.25 and red (🟥) otherwise; the rest of the bar is white (⬜).
	    The number of filled squares is `int(score * max_width)`, clamped to
	    the range 0..max_width.
	    """
	    n_filled = max(0, min(int(score * max_width), max_width))
	    n_empty = max_width - n_filled

	    # Pick the colour of the filled squares from the score thresholds.
	    if score > 0.35:
	        block = "🟦"
	    elif score > 0.25:
	        block = "🟨"
	    else:
	        block = "🟥"

	    return block * n_filled + "⬜" * n_empty

	def _speaker_share(speakers, population):
	    """Return speakers / population clamped to [0, 1], or 0.0 if the population is missing or zero."""
	    if not population:
	        return 0.0
	    return max(0.0, min(speakers / population, 1.0))


	def _language_row(title, speakers, population, score, label):
	    """Format one hover entry: bold title, speaker-share bar and score bar."""
	    speaker_bar = make_black_bar(_speaker_share(speakers, population))
	    score_bar = make_colored_bar(score)
	    return (
	        f"<b>{title}</b><br>"
	        f"{speaker_bar} {format_number(speakers)} speakers<br>"
	        f"{score_bar} {score:.3f} {label}<br>"
	    )


	def create_world_map(metric):
	    """Create a choropleth of the speaker-weighted average score per country.

	    For every language that has a "population" mapping (country code ->
	    speakers) and a value for the metric, the language's score is accumulated
	    per country, weighted by the number of speakers there. The hover text
	    lists the five largest languages of each country and summarises the rest.

	    Args:
	        metric: One entry of `METRICS`; uses "field_name", "label" and
	            "display_name".

	    Returns:
	        A Plotly figure with one coloured region per country.
	    """
	    field = metric["field_name"]
	    label = metric["label"]
	    population_data = get_population_data()

	    # Accumulate speakers and speaker-weighted scores per ISO-3 country code.
	    country_data = {}
	    for lang in results:
	        # Skip languages without the required data
	        if "population" not in lang or lang[field] is None:
	            continue
	        lang_score = lang[field]

	        for country_code, speakers in lang["population"].items():
	            # Only the code conversion is expected to fail, so only that part
	            # is guarded; unrecognised codes are skipped.
	            try:
	                country = pycountry.countries.get(alpha_2=country_code)
	                if country is None:
	                    continue
	                iso3_code = country.alpha_3
	            except (KeyError, AttributeError):
	                continue

	            entry = country_data.setdefault(
	                iso3_code,
	                {
	                    "name": getattr(country, "name", iso3_code),
	                    "total_speakers": 0,
	                    "population": population_data.get(country_code, 0),
	                    "weighted_score_sum": 0,
	                    "languages": [],
	                },
	            )
	            entry["total_speakers"] += speakers
	            entry["weighted_score_sum"] += speakers * lang_score
	            entry["languages"].append(
	                {
	                    "name": lang["language_name"],
	                    "speakers": speakers,
	                    "score": lang_score,
	                }
	            )

	    # Calculate final weighted averages and prepare hover text
	    countries = []
	    scores = []
	    hover_texts = []
	    for iso3_code, data in country_data.items():
	        if not data["total_speakers"]:
	            # A weighted average is undefined without speakers; skipping the
	            # country avoids a ZeroDivisionError.
	            continue
	        weighted_avg = data["weighted_score_sum"] / data["total_speakers"]

	        # Top 5 languages by speakers get their own row, the rest one summary row.
	        langs = sorted(data["languages"], key=lambda item: item["speakers"], reverse=True)
	        main_langs, other_langs = langs[:5], langs[5:]

	        lang_rows = [
	            _language_row(
	                item["name"], item["speakers"], data["population"], item["score"], label
	            )
	            for item in main_langs
	        ]
	        if other_langs:
	            other_speakers = sum(item["speakers"] for item in other_langs)
	            other_avg_score = sum(item["score"] for item in other_langs) / len(other_langs)
	            lang_rows.append(
	                _language_row(
	                    f"+{len(other_langs)} other languages",
	                    other_speakers,
	                    data["population"],
	                    other_avg_score,
	                    label,
	                )
	            )

	        countries.append(iso3_code)
	        scores.append(weighted_avg)
	        hover_texts.append(f"<b>{data['name']}</b><br><br>{'<br>'.join(lang_rows)}")

	    # Create the choropleth map
	    fig = go.Figure(
	        data=go.Choropleth(
	            locations=countries,
	            locationmode="ISO-3",
	            z=scores,
	            text=hover_texts,
	            hoverinfo="text",
	            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
	            colorbar=dict(
	                title=label,
	                orientation="h",  # horizontal orientation
	                y=-0.2,  # position below map
	                yanchor="bottom",
	                len=0.5,  # length of colorbar
	                x=0.5,  # center horizontally
	                xanchor="center",
	                thickness=20,  # make it a bit thicker when horizontal
	            ),
	        )
	    )

	    fig.update_layout(
	        title=dict(text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"),
	        geo=dict(
	            showframe=True,
	            showcoastlines=True,
	            projection_type="equal earth",
	            showland=True,
	            landcolor="#f8f9fa",
	            coastlinecolor="#e0e0e0",
	            countrycolor="#e0e0e0",
	        ),
	        height=600,
	        margin=dict(l=0, r=0, t=30, b=0),
	        paper_bgcolor="white",
	        hoverlabel=dict(
	            bgcolor="beige",
	            font_size=12,
	        ),
	    )

	    return fig

	def create_metric_explanation(metric):
	    """Return a Markdown component showing the metric's "explanation" text."""
	    explanation_text = metric["explanation"]
	    return gr.Markdown(explanation_text)


	# Create the visualization components
	from functools import partial

	with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
	    gr.Markdown("# AI Language Proficiency Benchmark")
	    gr.Markdown(
	        "Comparing language proficiency across different models and languages."
	    )
	    start_metric = METRICS["overall_performance"]

	    metric = gr.Dropdown(
	        choices=[metric_info["display_name"] for metric_info in METRICS.values()],
	        value=start_metric["display_name"],
	        label="Select Metric",
	        interactive=True,
	    )
	    metric_explanation = create_metric_explanation(start_metric)

	    gr.Markdown("## Model Comparison")
	    # Keep a handle on the leaderboard so that it can be re-rendered when the
	    # selected metric changes (it is computed from the selected metric).
	    leaderboard = create_leaderboard_df(start_metric)
	    model_comparison_plot = gr.Plot(
	        value=create_model_comparison_plot(start_metric),
	        label="Model Comparison",
	    )

	    gr.Markdown("## Language Stats")
	    # The language table shows a fixed set of columns and does not depend on
	    # the selected metric, so it needs no change handler.
	    create_language_stats_df(start_metric)
	    scatter_plot = gr.Plot(
	        value=create_scatter_plot(start_metric),
	        label="Speaker Population vs. Metric",
	    )
	    world_map = gr.Plot(
	        value=create_world_map(start_metric),
	        label="World Map",
	        container=False,
	        elem_classes="fullwidth-plot",
	    )

	    gr.Markdown(
	"""
	## Methodology
	### Dataset
	- Using [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality human-translated benchmark comprising 200 languages
	- Each language is tested with the same 100 sentences
	- All translations are from the evaluated language to a fixed set of representative languages sampled by number of speakers
	- Language statistics sourced from Ethnologue and Wikidata

	### Models & Evaluation
	- Models accessed through [OpenRouter](https://openrouter.ai/), including fast models of all big labs, open and closed
	- BLEU Score: Translations are evaluated using the BLEU metric, which measures how similar the AI's translation is to a human reference translation -- higher is better

	### Language Categories
	Languages are divided into three tiers based on translation difficulty:
	- High-Resource: Top 25% of languages by BLEU score (easiest to translate)
	- Mid-Resource: Middle 50% of languages
	- Low-Resource: Bottom 25% of languages (hardest to translate)
	""",
	        container=True,
	    )

	    # The dropdown reports the display name; map it back to the METRICS entry.
	    metrics_by_display_name = {m["display_name"]: m for m in METRICS.values()}

	    def update_component(fn, metric_choice):
	        """Call `fn` with the METRICS entry whose display name is `metric_choice`."""
	        return fn(metrics_by_display_name[metric_choice])

	    # Connect the dropdown to every component that depends on the metric.
	    for builder, component in (
	        (create_metric_explanation, metric_explanation),
	        (create_leaderboard_df, leaderboard),
	        (create_model_comparison_plot, model_comparison_plot),
	        (create_scatter_plot, scatter_plot),
	        (create_world_map, world_map),
	    ):
	        metric.change(
	            fn=partial(update_component, builder),
	            inputs=metric,
	            outputs=component,
	        )

	demo.launch()