David Pomerenke committed
Commit ed78196 · 1 Parent(s): 0a5d23d
Files changed (1)
  1. app.py +137 -95
app.py CHANGED
@@ -1,5 +1,6 @@
-import gradio as gr
 import json
+
+import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 
@@ -8,6 +9,106 @@ with open("results.json") as f:
     results = json.load(f)
 
 
+def create_leaderboard_df(results):
+    # Sort languages by average BLEU to determine resource categories
+    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
+    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
+    n_langs = len(sorted_langs)
+    high_cutoff = n_langs // 4  # top 25%
+    low_cutoff = n_langs - n_langs // 4  # bottom 25%
+
+    # Create sets of languages for each category
+    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
+    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
+
+    # Get all model scores with categorization
+    model_scores = {}
+    for lang in results:
+        category = (
+            "High-Resource"
+            if lang["language_name"] in high_resource
+            else "Low-Resource"
+            if lang["language_name"] in low_resource
+            else "Mid-Resource"
+        )
+
+        for score in lang["scores"]:
+            model_name = score["model"].split("/")[-1]
+            if model_name not in model_scores:
+                model_scores[model_name] = {
+                    "High-Resource": [],
+                    "Mid-Resource": [],
+                    "Low-Resource": [],
+                }
+            model_scores[model_name][category].append(score["bleu"])
+
+    # Calculate average scores and create DataFrame
+    leaderboard_data = []
+    for model, categories in model_scores.items():
+        # Calculate averages for each category
+        high_avg = (
+            round(
+                sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3
+            )
+            if categories["High-Resource"]
+            else 0
+        )
+        mid_avg = (
+            round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3)
+            if categories["Mid-Resource"]
+            else 0
+        )
+        low_avg = (
+            round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3)
+            if categories["Low-Resource"]
+            else 0
+        )
+
+        # Calculate overall average
+        all_scores = (
+            categories["High-Resource"]
+            + categories["Mid-Resource"]
+            + categories["Low-Resource"]
+        )
+        overall_avg = round(sum(all_scores) / len(all_scores), 3)
+
+        leaderboard_data.append(
+            {
+                "Model": model,
+                "Overall BLEU": overall_avg,
+                "High-Resource BLEU": high_avg,
+                "Mid-Resource BLEU": mid_avg,
+                "Low-Resource BLEU": low_avg,
+                "Languages Tested": len(all_scores),
+            }
+        )
+
+    # Sort by overall BLEU
+    df = pd.DataFrame(leaderboard_data)
+    df = df.sort_values("Overall BLEU", ascending=False)
+
+    # Add rank and medals
+    df["Rank"] = range(1, len(df) + 1)
+    df["Rank"] = df["Rank"].apply(
+        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
+    )
+
+    # Reorder columns
+    df = df[
+        [
+            "Rank",
+            "Model",
+            "Overall BLEU",
+            "High-Resource BLEU",
+            "Mid-Resource BLEU",
+            "Low-Resource BLEU",
+            "Languages Tested",
+        ]
+    ]
+
+    return df
+
+
 def create_model_comparison_plot(results):
     # Extract all unique models
     models = set()
@@ -49,6 +150,35 @@ def create_model_comparison_plot(results):
     return fig
 
 
+def create_language_stats_df(results):
+    # Create a list to store flattened data
+    flat_data = []
+
+    for lang in results:
+        # Find the best model and its BLEU score
+        best_score = max(
+            lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
+        )
+
+        row = {
+            "Language": lang["language_name"],
+            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
+            "Models Tested": len(lang["scores"]),
+            "Average BLEU": round(lang["bleu"], 3)
+            if lang["bleu"] is not None
+            else "N/A",
+            "Best Model": best_score["model"]
+            if best_score["model"] is not None
+            else "N/A",
+            "Best Model BLEU": round(best_score["bleu"], 3)
+            if best_score["bleu"] is not None
+            else "N/A",
+        }
+        flat_data.append(row)
+
+    return pd.DataFrame(flat_data)
+
+
 def create_scatter_plot(results):
     fig = go.Figure()
 
@@ -83,96 +213,6 @@ def create_scatter_plot(results):
     return fig
 
 
-def create_results_df(results):
-    # Create a list to store flattened data
-    flat_data = []
-
-    for lang in results:
-        # Find the best model and its BLEU score
-        best_score = max(lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"])
-
-        row = {
-            "Language": lang["language_name"],
-            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
-            "Models Tested": len(lang["scores"]),
-            "Average BLEU": round(lang["bleu"], 3) if lang["bleu"] is not None else "N/A",
-            "Best Model": best_score["model"] if best_score["model"] is not None else "N/A",
-            "Best Model BLEU": round(best_score["bleu"], 3) if best_score["bleu"] is not None else "N/A",
-        }
-        flat_data.append(row)
-
-    return pd.DataFrame(flat_data)
-
-
-def create_leaderboard_df(results):
-    # Sort languages by average BLEU to determine resource categories
-    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
-    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
-    n_langs = len(sorted_langs)
-    high_cutoff = n_langs // 4  # top 25%
-    low_cutoff = n_langs - n_langs // 4  # bottom 25%
-
-    # Create sets of languages for each category
-    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
-    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
-
-    # Get all model scores with categorization
-    model_scores = {}
-    for lang in results:
-        category = ("High-Resource" if lang["language_name"] in high_resource else
-                    "Low-Resource" if lang["language_name"] in low_resource else
-                    "Mid-Resource")
-
-        for score in lang["scores"]:
-            model_name = score["model"].split("/")[-1]
-            if model_name not in model_scores:
-                model_scores[model_name] = {
-                    "High-Resource": [],
-                    "Mid-Resource": [],
-                    "Low-Resource": []
-                }
-            model_scores[model_name][category].append(score["bleu"])
-
-    # Calculate average scores and create DataFrame
-    leaderboard_data = []
-    for model, categories in model_scores.items():
-        # Calculate averages for each category
-        high_avg = round(sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3) if categories["High-Resource"] else 0
-        mid_avg = round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3) if categories["Mid-Resource"] else 0
-        low_avg = round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3) if categories["Low-Resource"] else 0
-
-        # Calculate overall average
-        all_scores = (categories["High-Resource"] +
-                      categories["Mid-Resource"] +
-                      categories["Low-Resource"])
-        overall_avg = round(sum(all_scores) / len(all_scores), 3)
-
-        leaderboard_data.append({
-            "Model": model,
-            "Overall BLEU": overall_avg,
-            "High-Resource BLEU": high_avg,
-            "Mid-Resource BLEU": mid_avg,
-            "Low-Resource BLEU": low_avg,
-            "Languages Tested": len(all_scores),
-        })
-
-    # Sort by overall BLEU
-    df = pd.DataFrame(leaderboard_data)
-    df = df.sort_values("Overall BLEU", ascending=False)
-
-    # Add rank and medals
-    df["Rank"] = range(1, len(df) + 1)
-    df["Rank"] = df["Rank"].apply(
-        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
-    )
-
-    # Reorder columns
-    df = df[["Rank", "Model", "Overall BLEU", "High-Resource BLEU",
-             "Mid-Resource BLEU", "Low-Resource BLEU", "Languages Tested"]]
-
-    return df
-
-
 # Create the visualization components
 with gr.Blocks(title="AI Language Translation Benchmark") as demo:
     gr.Markdown("# AI Language Translation Benchmark")
@@ -180,7 +220,7 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         "Comparing translation performance across different AI models and languages"
     )
 
-    df = create_results_df(results)
+    df = create_language_stats_df(results)
     leaderboard_df = create_leaderboard_df(results)
     bar_plot = create_model_comparison_plot(results)
     scatter_plot = create_scatter_plot(results)
@@ -190,8 +230,8 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         gr.DataFrame(value=df, label="Language Results", show_search="search")
         gr.Plot(value=scatter_plot, label="Language Coverage")
 
-
-    gr.Markdown("""
+    gr.Markdown(
+        """
 ## Methodology
 ### Dataset
 - Using [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality human-translated benchmark comprising 200 languages
@@ -208,6 +248,8 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
 - High-Resource: Top 25% of languages by BLEU score (easiest to translate)
 - Mid-Resource: Middle 50% of languages
 - Low-Resource: Bottom 25% of languages (hardest to translate)
-    """, container=True)
+    """,
+        container=True,
+    )
 
 demo.launch()
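
For reviewers, a minimal sketch of the `results.json` record shape that the new `create_leaderboard_df` and `create_language_stats_df` functions assume, followed by the quartile arithmetic behind the High/Mid/Low-Resource tiers described in the Methodology text. Field names are taken from the code; every concrete language, model name, and number below is an invented placeholder, not data from the benchmark.

```python
# Hypothetical illustration of the per-language record that app.py reads.
# All concrete values are made up; only the keys come from the code above.
results = [
    {
        "language_name": "ExampleLang",
        "speakers": 12_000_000,
        "bleu": 0.41,  # average BLEU across the models below
        "scores": [
            {"model": "some-org/model-a", "bleu": 0.44},
            {"model": "some-org/model-b", "bleu": 0.38},
        ],
    },
    # ... one entry per language
]

# Resource tiers as in create_leaderboard_df: top 25% of languages by average
# BLEU are High-Resource, bottom 25% are Low-Resource, the middle 50% Mid-Resource.
langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
n_langs = len(sorted_langs)
high_cutoff = n_langs // 4  # e.g. 8 languages -> top 2 are High-Resource
low_cutoff = n_langs - n_langs // 4  # e.g. 8 languages -> bottom 2 are Low-Resource
high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
```

With this split, a model's "Low-Resource BLEU" column averages only over languages in the bottom quartile of average BLEU, so the tiers reflect measured translation difficulty rather than speaker counts.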