import json

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

# Load and process results
with open("results.json") as f:
    results = json.load(f)


def create_leaderboard_df(results):
    # Sort languages by average BLEU to determine resource categories
    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
    n_langs = len(sorted_langs)
    high_cutoff = n_langs // 4  # top 25%
    low_cutoff = n_langs - n_langs // 4  # bottom 25%

    # Create sets of languages for each category
    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}

    # Get all model scores with categorization
    model_scores = {}
    for lang in results:
        category = (
            "High-Resource"
            if lang["language_name"] in high_resource
            else "Low-Resource"
            if lang["language_name"] in low_resource
            else "Mid-Resource"
        )
        for score in lang["scores"]:
            model_name = score["model"].split("/")[-1]
            if model_name not in model_scores:
                model_scores[model_name] = {
                    "High-Resource": [],
                    "Mid-Resource": [],
                    "Low-Resource": [],
                }
            model_scores[model_name][category].append(score["bleu"])

    # Calculate average scores and create DataFrame
    leaderboard_data = []
    for model, categories in model_scores.items():
        # Calculate averages for each category
        high_avg = (
            round(
                sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3
            )
            if categories["High-Resource"]
            else 0
        )
        mid_avg = (
            round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3)
            if categories["Mid-Resource"]
            else 0
        )
        low_avg = (
            round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3)
            if categories["Low-Resource"]
            else 0
        )

        # Calculate overall average
        all_scores = (
            categories["High-Resource"]
            + categories["Mid-Resource"]
            + categories["Low-Resource"]
        )
        overall_avg = round(sum(all_scores) / len(all_scores), 3)

        leaderboard_data.append(
            {
                "Model": model,
                "Overall BLEU": overall_avg,
                "High-Resource BLEU": high_avg,
                "Mid-Resource BLEU": mid_avg,
                "Low-Resource BLEU": low_avg,
                "Languages Tested": len(all_scores),
            }
        )

    # Sort by overall BLEU
    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall BLEU", ascending=False)

    # Add rank and medals
    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )

    # Reorder columns
    df = df[
        [
            "Rank",
            "Model",
            "Overall BLEU",
            "High-Resource BLEU",
            "Mid-Resource BLEU",
            "Low-Resource BLEU",
            "Languages Tested",
        ]
    ]

    return df


def create_model_comparison_plot(results):
    # Extract all unique models
    models = set()
    for lang in results:
        for score in lang["scores"]:
            models.add(score["model"])
    models = list(models)

    # Create traces for each model
    traces = []
    for model in models:
        x_vals = []  # languages
        y_vals = []  # BLEU scores

        for lang in results:
            model_score = next(
                (s["bleu"] for s in lang["scores"] if s["model"] == model), None
            )
            if model_score is not None:
                x_vals.append(lang["language_name"])
                y_vals.append(model_score)

        traces.append(
            go.Bar(
                name=model.split("/")[-1],
                x=x_vals,
                y=y_vals,
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        title="BLEU Scores by Model and Language",
        xaxis_title="Language",
        yaxis_title="BLEU Score",
        barmode="group",
        height=500,
    )
    return fig


def create_language_stats_df(results):
    # Create a list to store flattened data
    flat_data = []

    for lang in results:
        # Find the best model and its BLEU score
        best_score = max(
            lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
        )

        row = {
            "Language": lang["language_name"],
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Models Tested": len(lang["scores"]),
            "Average BLEU": round(lang["bleu"], 3)
            if lang["bleu"] is not None
            else "N/A",
            "Best Model": best_score["model"]
            if best_score["model"] is not None
            else "N/A",
            "Best Model BLEU": round(best_score["bleu"], 3)
            if best_score["bleu"] is not None
            else "N/A",
        }
        flat_data.append(row)

    return pd.DataFrame(flat_data)


def create_scatter_plot(results):
    fig = go.Figure()

    x_vals = [lang["speakers"] / 1_000_000 for lang in results]  # Convert to millions
    y_vals = [lang["bleu"] for lang in results]
    labels = [lang["language_name"] for lang in results]

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate="%{text}<br>"
            + "Speakers: %{x:.1f}M<br>"
            + "BLEU Score: %{y:.3f}",
        )
    )

    fig.update_layout(
        title="Language Coverage: Speakers vs BLEU Score",
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title="Average BLEU Score",
        height=500,
        showlegend=False,
    )

    # Use log scale for x-axis since speaker numbers vary widely
    fig.update_xaxes(type="log")

    return fig


# Create the visualization components
with gr.Blocks(title="AI Language Translation Benchmark") as demo:
    gr.Markdown("# AI Language Translation Benchmark")
    gr.Markdown(
        "Comparing translation performance across different AI models and languages"
    )

    df = create_language_stats_df(results)
    leaderboard_df = create_leaderboard_df(results)
    bar_plot = create_model_comparison_plot(results)
    scatter_plot = create_scatter_plot(results)

    gr.DataFrame(value=leaderboard_df, label="Model Leaderboard", show_search=False)
    gr.Plot(value=bar_plot, label="Model Comparison")
    gr.DataFrame(value=df, label="Language Results", show_search="search")
    gr.Plot(value=scatter_plot, label="Language Coverage")

    gr.Markdown(
        """
        ## Methodology

        ### Dataset
        - Uses the [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality human-translated benchmark covering 200 languages
        - Each language is tested with the same 100 sentences
        - All translations are from the evaluated language to a fixed set of representative languages, sampled by number of speakers
        - Language statistics are sourced from Ethnologue and Wikidata

        ### Models & Evaluation
        - Models are accessed through [OpenRouter](https://openrouter.ai/), including fast models from all major labs, both open and closed
        - **BLEU Score**: Translations are evaluated with the BLEU metric, which measures how similar the model's translation is to a human reference translation; higher is better

        ### Language Categories
        Languages are divided into three tiers based on translation difficulty:
        - High-Resource: Top 25% of languages by BLEU score (easiest to translate)
        - Mid-Resource: Middle 50% of languages
        - Low-Resource: Bottom 25% of languages (hardest to translate)
        """,
        container=True,
    )

demo.launch()