David Pomerenke committed
Commit df383f6 · 1 Parent(s): ed78196
Add links to OpenRouter

app.py CHANGED
@@ -4,11 +4,14 @@ import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 
-# Load and process results
 with open("results.json") as f:
     results = json.load(f)
 
 
+def mean(lst):
+    return sum(lst) / len(lst)
+
+
 def create_leaderboard_df(results):
     # Sort languages by average BLEU to determine resource categories
     langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
@@ -33,33 +36,31 @@ def create_leaderboard_df(results):
         )
 
         for score in lang["scores"]:
-
-            if score["model"] not in model_scores:
-                model_scores[score["model"]] = {
+            model = score["model"]
+            if model not in model_scores:
+                model_scores[model] = {
                     "High-Resource": [],
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
-            model_scores[score["model"]][category].append(score["bleu"])
+            model_scores[model][category].append(score["bleu"])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
     for model, categories in model_scores.items():
         # Calculate averages for each category
         high_avg = (
-            round(
-                sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3
-            )
+            round(mean(categories["High-Resource"]), 3)
            if categories["High-Resource"]
             else 0
         )
         mid_avg = (
-            round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3)
+            round(mean(categories["Mid-Resource"]), 3)
             if categories["Mid-Resource"]
             else 0
         )
         low_avg = (
-            round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3)
+            round(mean(categories["Low-Resource"]), 3)
             if categories["Low-Resource"]
             else 0
         )
@@ -72,9 +73,10 @@ def create_leaderboard_df(results):
         )
         overall_avg = round(sum(all_scores) / len(all_scores), 3)
 
+        model_name = model.split("/")[-1]
         leaderboard_data.append(
             {
-                "Model": model,
+                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                 "Overall BLEU": overall_avg,
                 "High-Resource BLEU": high_avg,
                 "Mid-Resource BLEU": mid_avg,
@@ -106,7 +108,20 @@ def create_leaderboard_df(results):
         ]
     ]
 
-    return df
+    return gr.DataFrame(
+        value=df,
+        label="Model Leaderboard",
+        show_search=False,
+        datatype=[
+            "number",
+            "markdown",
+            "number",
+            "number",
+            "number",
+            "number",
+            "number",
+        ],
+    )
 
 
 def create_model_comparison_plot(results):
@@ -160,23 +175,30 @@ def create_language_stats_df(results):
             lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
         )
 
+        model = best_score['model']
+        model_name = model.split('/')[-1] if model else "N/A"
+        model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" if model else "N/A"
         row = {
-            "Language": lang["language_name"],
+            "Language": f"**{lang['language_name']}**",
             "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
             "Models Tested": len(lang["scores"]),
             "Average BLEU": round(lang["bleu"], 3)
             if lang["bleu"] is not None
             else "N/A",
-            "Best Model": best_score["model"]
-            if best_score["model"] is not None
-            else "N/A",
+            "Best Model": model_link,
             "Best Model BLEU": round(best_score["bleu"], 3)
             if best_score["bleu"] is not None
             else "N/A",
         }
         flat_data.append(row)
 
-    return pd.DataFrame(flat_data)
+    df = pd.DataFrame(flat_data)
+    return gr.DataFrame(
+        value=df,
+        label="Language Results",
+        show_search="search",
+        datatype=["markdown", "number", "number", "number", "markdown", "number"],
+    )
 
 
 def create_scatter_plot(results):
@@ -220,14 +242,12 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         "Comparing translation performance across different AI models and languages"
     )
 
-    df = create_language_stats_df(results)
-    leaderboard_df = create_leaderboard_df(results)
     bar_plot = create_model_comparison_plot(results)
     scatter_plot = create_scatter_plot(results)
 
-    gr.DataFrame(value=leaderboard_df, label="Model Leaderboard")
+    create_leaderboard_df(results)
     gr.Plot(value=bar_plot, label="Model Comparison")
-    gr.DataFrame(value=df, label="Language Results")
+    create_language_stats_df(results)
     gr.Plot(value=scatter_plot, label="Language Coverage")
 
     gr.Markdown(
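
The commit introduces two link formats: the leaderboard builds a Markdown link from the OpenRouter model slug, while the per-language "Best Model" cell uses an inline-styled HTML anchor; both columns are declared as "markdown" in the gr.DataFrame datatype lists so Gradio renders them as links. A minimal sketch of the two cell values, using a hypothetical slug (real values come from score["model"] in results.json):

# Hypothetical OpenRouter slug in owner/name form
model = "mistralai/mistral-nemo"
model_name = model.split("/")[-1]  # -> "mistral-nemo"

# Leaderboard cell (create_leaderboard_df): a Markdown link
leaderboard_cell = f"[{model_name}](https://openrouter.ai/{model})"

# "Best Model" cell (create_language_stats_df): an HTML anchor that
# inherits the table's text color instead of the default link styling
best_model_cell = (
    f"<a href='https://openrouter.ai/{model}' "
    f"style='text-decoration: none; color: inherit;'>{model_name}</a>"
)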
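The last hunk also changes how the tables reach the page: the helpers now construct gr.DataFrame components themselves rather than returning pandas DataFrames for the caller to wrap. Gradio attaches any component instantiated inside a gr.Blocks context to the layout, so a bare call suffices. A minimal sketch of that pattern (the helper name and data are made up):

import gradio as gr
import pandas as pd


def create_table():
    # Building the component inside the helper lets it own presentation
    # details such as the label and per-column datatypes.
    df = pd.DataFrame({"Model": ["[demo](https://openrouter.ai/demo)"]})
    return gr.DataFrame(value=df, label="Demo", datatype=["markdown"])


with gr.Blocks() as demo:
    create_table()  # instantiated inside the context, so it appears on the page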