David Pomerenke committed
Commit df383f6 · 1 Parent(s): ed78196

Add links to OpenRouter
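In short: model names in the two tables now link to the corresponding model page on OpenRouter (https://openrouter.ai/{model-id}), rendered through markdown/HTML cells in the Gradio DataFrames. A minimal, self-contained sketch of the leaderboard half of the idea (the model IDs and BLEU values below are placeholders, not the app's data):

import gradio as gr
import pandas as pd

# Placeholder model IDs and scores, for illustration only.
models = ["openai/gpt-4o-mini", "meta-llama/llama-3.3-70b-instruct"]
leaderboard = pd.DataFrame(
    {
        "Model": [f"[{m.split('/')[-1]}](https://openrouter.ai/{m})" for m in models],
        "Overall BLEU": [0.31, 0.28],
    }
)

with gr.Blocks() as demo:
    # Declaring the column as "markdown" makes Gradio render the link
    # instead of showing the raw [text](url) string.
    gr.DataFrame(value=leaderboard, datatype=["markdown", "number"], label="Model Leaderboard")

demo.launch()

The commit applies this inside create_leaderboard_df and, with an HTML anchor instead of a markdown link, to the Best Model column in create_language_stats_df. The full diff follows.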

Files changed (1)
  1. app.py (+41 -21)
app.py CHANGED
@@ -4,11 +4,14 @@ import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 
-# Load and process results
 with open("results.json") as f:
     results = json.load(f)
 
 
+def mean(lst):
+    return sum(lst) / len(lst)
+
+
 def create_leaderboard_df(results):
     # Sort languages by average BLEU to determine resource categories
     langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
@@ -33,33 +36,31 @@ def create_leaderboard_df(results):
         )
 
         for score in lang["scores"]:
-            model_name = score["model"].split("/")[-1]
-            if model_name not in model_scores:
-                model_scores[model_name] = {
+            model = score["model"]
+            if model not in model_scores:
+                model_scores[model] = {
                     "High-Resource": [],
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
-            model_scores[model_name][category].append(score["bleu"])
+            model_scores[model][category].append(score["bleu"])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
     for model, categories in model_scores.items():
         # Calculate averages for each category
         high_avg = (
-            round(
-                sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3
-            )
+            round(mean(categories["High-Resource"]), 3)
            if categories["High-Resource"]
            else 0
         )
         mid_avg = (
-            round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3)
+            round(mean(categories["Mid-Resource"]), 3)
            if categories["Mid-Resource"]
            else 0
         )
         low_avg = (
-            round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3)
+            round(mean(categories["Low-Resource"]), 3)
            if categories["Low-Resource"]
            else 0
         )
@@ -72,9 +73,10 @@ def create_leaderboard_df(results):
         )
         overall_avg = round(sum(all_scores) / len(all_scores), 3)
 
+        model_name = model.split("/")[-1]
         leaderboard_data.append(
             {
-                "Model": model,
+                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                 "Overall BLEU": overall_avg,
                 "High-Resource BLEU": high_avg,
                 "Mid-Resource BLEU": mid_avg,
@@ -106,7 +108,20 @@ def create_leaderboard_df(results):
         ]
     ]
 
-    return df
+    return gr.DataFrame(
+        value=df,
+        label="Model Leaderboard",
+        show_search=False,
+        datatype=[
+            "number",
+            "markdown",
+            "number",
+            "number",
+            "number",
+            "number",
+            "number",
+        ],
+    )
 
 
 def create_model_comparison_plot(results):
@@ -160,23 +175,30 @@ def create_language_stats_df(results):
             lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
         )
 
+        model = best_score['model']
+        model_name = model.split('/')[-1] if model else "N/A"
+        model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" if model else "N/A"
         row = {
-            "Language": lang["language_name"],
+            "Language": f"**{lang['language_name']}**",
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Models Tested": len(lang["scores"]),
            "Average BLEU": round(lang["bleu"], 3)
            if lang["bleu"] is not None
            else "N/A",
-            "Best Model": best_score["model"]
-            if best_score["model"] is not None
-            else "N/A",
+            "Best Model": model_link,
            "Best Model BLEU": round(best_score["bleu"], 3)
            if best_score["bleu"] is not None
            else "N/A",
         }
         flat_data.append(row)
 
-    return pd.DataFrame(flat_data)
+    df = pd.DataFrame(flat_data)
+    return gr.DataFrame(
+        value=df,
+        label="Language Results",
+        show_search="search",
+        datatype=["markdown", "number", "number", "number", "markdown", "number"],
+    )
 
 
 def create_scatter_plot(results):
@@ -220,14 +242,12 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         "Comparing translation performance across different AI models and languages"
     )
 
-    df = create_language_stats_df(results)
-    leaderboard_df = create_leaderboard_df(results)
     bar_plot = create_model_comparison_plot(results)
     scatter_plot = create_scatter_plot(results)
 
-    gr.DataFrame(value=leaderboard_df, label="Model Leaderboard", show_search=False)
+    create_leaderboard_df(results)
     gr.Plot(value=bar_plot, label="Model Comparison")
-    gr.DataFrame(value=df, label="Language Results", show_search="search")
+    create_language_stats_df(results)
     gr.Plot(value=scatter_plot, label="Language Coverage")
 
     gr.Markdown(
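A side effect of the refactor: create_leaderboard_df and create_language_stats_df now build and return the gr.DataFrame components themselves and are simply called inside the gr.Blocks context. Gradio attaches any component instantiated while a Blocks context is active, so the intermediate DataFrame variables are no longer needed. A compact sketch of that pattern (names here are illustrative, not the app's):

import gradio as gr
import pandas as pd

def language_table():
    # Because this runs inside the `with gr.Blocks()` block below, the
    # component is added to the layout automatically; returning it is only
    # needed if it should later be wired into event handlers.
    df = pd.DataFrame({"Language": ["**Example**"], "Average BLEU": [0.3]})
    return gr.DataFrame(value=df, datatype=["markdown", "number"], label="Language Results")

with gr.Blocks() as demo:
    language_table()

demo.launch()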