David Pomerenke committed
Commit: ba2a2f0 · Parent(s): e32fd78

UI logic for selecting model type and metric
app.py
CHANGED
@@ -87,7 +87,7 @@ METRICS = {
         """,
     },
     {
-        "display_name": "Automatic Speech Recognition ChrF",
+        "display_name": "Automatic Speech Recognition (ChrF)",
         "field_name": "asr_chrf",
         "label": "ChrF",
         "explanation": """
@@ -104,123 +104,50 @@ def mean(lst):
     return sum(lst) / len(lst)
 
 
-def create_leaderboard_df(metric):
-    model_scores = {}
-    for lang in languages_with_scores:
-        category = (
-            "High-Resource"
-            if lang["language_name"] in high_resource
-            else "Low-Resource"
-            if lang["language_name"] in low_resource
-            else "Mid-Resource"
-        )
-
-        for score in lang["scores"]:
-            model = score["model"]
-            if model not in model_scores:
-                model_scores[model] = {
-                    "High-Resource": [],
-                    "Mid-Resource": [],
-                    "Low-Resource": [],
-                }
-            # Check if the metric field exists in the score dictionary before accessing it
-            if metric["field_name"] in score:
-                model_scores[model][category].append(score[metric["field_name"]])
-            # If the metric is missing, we'll skip this score
-
-    # Calculate average scores and create DataFrame
-    leaderboard_data = []
-    for model, categories in model_scores.items():
-        # Calculate averages for each category
-        high_avg = (
-            round(mean(categories["High-Resource"]), 3)
-            if categories["High-Resource"]
-            else 0
-        )
-        mid_avg = (
-            round(mean(categories["Mid-Resource"]), 3)
-            if categories["Mid-Resource"]
-            else 0
-        )
-        low_avg = (
-            round(mean(categories["Low-Resource"]), 3)
-            if categories["Low-Resource"]
-            else 0
-        )
-
-        # Calculate overall average
-        all_scores = (
-            categories["High-Resource"]
-            + categories["Mid-Resource"]
-            + categories["Low-Resource"]
-        )
-        # Check if all_scores is empty to avoid division by zero
-        overall_avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0
-
-        model_name = model.split("/")[-1]
-        leaderboard_data.append(
-            {
-                "Model": f"[{model_name}](https://openrouter.ai/{model})",
-                "Overall Score": overall_avg,
-                "High-Resource Score": high_avg,
-                "Mid-Resource Score": mid_avg,
-                "Low-Resource Score": low_avg,
-                "Languages Tested": len(all_scores),
-            }
-        )
-
-    # Sort by overall BLEU
-    df = pd.DataFrame(leaderboard_data)
-    df = df.sort_values("Overall Score", ascending=False)
-
-    # Add rank and medals
+def create_leaderboard_df(model_type, metric=None):
+    metric = metric or METRICS[model_type][0]
+    _model_type = {"t2t": "text-to-text", "s2t": "speech-to-text"}[model_type]
+    models = {
+        score["model"]
+        for lang in languages_with_scores
+        for score in lang["scores"]
+        if score["model_type"] == _model_type
+    }
+    model_scores = [
+        {"model": score["model"], metric["field_name"]: score[metric["field_name"]]}
+        for lang in languages_with_scores
+        for score in lang["scores"]
+        for model in models
+        if score["model"] == model
+    ]
+    df = (
+        pd.DataFrame(model_scores)
+        .groupby("model")
+        .agg({metric["field_name"]: ["mean", "count"]})
+        .reset_index()
+    )
+    # Flatten the multi-level column names
+    df.columns = df.columns.map(
+        lambda x: f"{x[0]}_{x[1]}" if isinstance(x, tuple) else x
+    )
+    df = df.rename(
+        columns={
+            f"{metric['field_name']}_mean": metric["label"],
+            f"{metric['field_name']}_count": "Languages Tested",
+            "model_": "Model",
+        }
+    )
+    df = df.sort_values(metric["label"], ascending=False)
     df["Rank"] = range(1, len(df) + 1)
     df["Rank"] = df["Rank"].apply(
         lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
     )
-
-    # Reorder columns
-    df = df[
-        [
-            "Rank",
-            "Model",
-            "Overall Score",
-            "High-Resource Score",
-            "Mid-Resource Score",
-            "Low-Resource Score",
-            "Languages Tested",
-        ]
-    ]
-
+    df = df[["Rank", "Model", metric["label"], "Languages Tested"]]
     return gr.DataFrame(
         value=df,
         label="Model Leaderboard",
         show_search=False,
-        datatype=[
-            "number",
-            "markdown",
-            "number",
-            "number",
-            "number",
-            "number",
-            "number",
-        ],
+        datatype=["number", "markdown", "number", "number"],
    )
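The rewritten `create_leaderboard_df` leans on a pandas pattern that is easy to trip over: `.agg({col: ["mean", "count"]})` returns a two-level column index, which is why the code flattens `df.columns` before renaming. A minimal, self-contained sketch of that pattern with made-up model names and scores:

```python
import pandas as pd

# Toy stand-in for the app's model_scores records.
model_scores = [
    {"model": "org/model-1", "mt_chrf": 0.61},
    {"model": "org/model-1", "mt_chrf": 0.55},
    {"model": "org/model-2", "mt_chrf": 0.48},
]
df = (
    pd.DataFrame(model_scores)
    .groupby("model")
    .agg({"mt_chrf": ["mean", "count"]})  # two aggregations in one pass
    .reset_index()
)
# The columns are now a MultiIndex: ("model", ""), ("mt_chrf", "mean"),
# ("mt_chrf", "count") — flatten them exactly as the diff does.
df.columns = df.columns.map(lambda x: f"{x[0]}_{x[1]}" if isinstance(x, tuple) else x)
print(df.columns.tolist())  # ['model_', 'mt_chrf_mean', 'mt_chrf_count']
```

The trailing underscore on `model_` comes from the empty second level of the index, which is why the rename map in the diff targets `"model_"` rather than `"model"`.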
@@ -292,7 +219,7 @@ def create_language_stats_df(metric):
             else "N/A"
         )
         commonvoice_link = (
-            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
+            f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {round(lang['commonvoice_hours'])}h</a>"
             if lang["commonvoice_hours"]
             else "N/A"
         )
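The `<!--… (for sorting)-->` comment that survives this hunk deserves a note: the column is rendered as markdown, so sorting presumably compares strings, and the hidden zero-padded number (via the `:07` format spec) makes the string order agree with the numeric order. A quick demonstration with made-up hour counts:

```python
# f"{9.5:07}" pads to width 7 with zeros: "00009.5"; 1200.0 becomes "01200.0".
hours = [9.5, 1200.0, 85.2]
cells = [f"<!--{h:07} (for sorting)--> 🎙️ {round(h)}h" for h in hours]
for cell in sorted(cells):
    print(cell)
# Lexicographically, "00009.5" < "00085.2" < "01200.0" — the same order as the
# numbers themselves, which a plain "9.5" vs "1200.0" string sort would not give.
```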
@@ -303,18 +230,18 @@ def create_language_stats_df(metric):
             # "Overall": round(lang["overall_score"], 3)
             # if lang["overall_score"] is not None
             # else "N/A",
-            "Translation": round(lang["mt_chrf"], 3)
-            if lang["mt_chrf"] is not None
+            "Best Model": model_link,
+            "MT": round(lang["mt_chrf"], 3)
+            if lang["mt_chrf"] is not None
             else "N/A",
-            "Classification": round(lang["cls_acc"], 3)
+            "CLS": round(lang["cls_acc"], 3)
             if lang["cls_acc"] is not None
             else "N/A",
             "MLM": round(lang["mlm_chrf"], 3)
             if lang["mlm_chrf"] is not None
             else "N/A",
-            "ASR": round(lang["asr_wer"], 3)
-            if lang["asr_wer"] is not None else "N/A",
-            "CommonVoice Hours": commonvoice_link,
+            "ASR": round(lang["asr_chrf"], 3) if lang["asr_wer"] is not None else "N/A",
+            "Common Voice": commonvoice_link,
         }
         flat_data.append(row)
@@ -327,40 +254,36 @@ def create_language_stats_df(metric):
         column_widths=[
             "100px",
             "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
-            "100px",
+            # "100px",
+            # "100px",
+            "200px",  # Best Model
+            "100px",  # MT
+            "100px",  # CLS
+            "100px",  # MLM
+            "100px",  # ASR
+            "100px",  # Common Voice
         ],
         datatype=[
             "markdown",  # Language
             "number",  # Speakers
             # "number",  # Models Tested
-            "number",  # Overall
+            # "number",  # Overall
+            "markdown",  # Best Model
             "number",  # Translation
             "number",  # Classification
             "number",  # MLM
             "number",  # ASR
-            "markdown",  # Best Model
             "markdown",  # CommonVoice Hours
         ],
     )
 
 
 def create_scatter_plot(metric):
-    # Filter results to include only languages with sufficient speakers
-    filtered_results = [
-        lang for lang in languages_with_scores if lang["speakers"] >= 10_000
-    ]
-
     # Create a list to store data for the scatter plot
     scatter_data = []
-
-    for lang in filtered_results:
+    for lang in languages_with_scores:
+        if lang["speakers"] < 10_000:
+            continue
         # Calculate average score for this metric across all models
         scores = [
             score[metric["field_name"]]
@@ -374,32 +297,44 @@ def create_scatter_plot(metric):
                 "language": lang["language_name"],
                 "speakers": lang["speakers"],
                 "score": avg_score,
+                "family": lang["language_family"],
             }
         )
 
     fig = go.Figure()
-
-    # Convert speakers to millions for display
-    x_vals = [
-        data["speakers"] / 1_000_000 for data in scatter_data
-    ]  # Convert to millions
+    x_vals = [data["speakers"] / 1_000_000 for data in scatter_data]
     y_vals = [data["score"] for data in scatter_data]
+    s_vals = [data["speakers"] / 20_000_000 for data in scatter_data]
+    color_pallette = [
+        "LightSkyBlue",
+        "LightGreen",
+        "LightCoral",
+        "LightPink",
+        "LightGoldenRodYellow",
+        "LightGray",
+        "LightSalmon",
+        "LightSeaGreen",
+    ]
+    color_mapping = {
+        family: color
+        for family, color in zip(
+            sorted(set(data["family"] for data in scatter_data)), color_pallette
+        )
+    }
+    c_vals = [color_mapping[data["family"]] for data in scatter_data]
     labels = [data["language"] for data in scatter_data]
-
-    # Create hover template
     hover_template = f"<b>%{{text}}</b><br>Speakers: %{{x:.1f}}M<br>{metric['label']}: %{{y:.3f}}<extra></extra>"
-
     fig.add_trace(
         go.Scatter(
             x=x_vals,
             y=y_vals,
+            marker=dict(size=s_vals, color=c_vals),
             mode="markers+text",
             text=labels,
             textposition="top center",
             hovertemplate=hover_template,
         )
     )
-
     fig.update_layout(
         title=None,
         xaxis_title="Number of Speakers (Millions)",
@@ -407,10 +342,7 @@ def create_scatter_plot(metric):
         height=500,
         showlegend=False,
     )
-
-    # Use log scale for x-axis since speaker numbers vary widely
     fig.update_xaxes(type="log")
-
     return fig
 
 
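One caveat in the new per-family coloring: `zip` stops at the shorter of its arguments, so if the scatter data ever contains more language families than the eight palette entries, `color_mapping[data["family"]]` would raise a `KeyError` for the unmapped families. A hedged variant (assuming reusing colors is acceptable) cycles the palette instead:

```python
from itertools import cycle

# Five families, four colors: cycle() wraps around instead of truncating.
palette = ["LightSkyBlue", "LightGreen", "LightCoral", "LightPink"]
families = sorted(
    {"Afro-Asiatic", "Austronesian", "Indo-European", "Niger-Congo", "Sino-Tibetan"}
)
color_mapping = dict(zip(families, cycle(palette)))
print(color_mapping)  # every family gets a color; the first one is reused
```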
@@ -569,7 +501,6 @@ def create_world_map(metric):
         scores.append(weighted_avg)
         hover_texts.append(hover_text)
 
-    # Create the choropleth map
     fig = go.Figure(
         data=go.Choropleth(
             locations=countries,
@@ -616,11 +547,21 @@ def create_world_map(metric):
     return fig
 
 
+def create_metric_selector(model_type):
+    match model_type:
+        case "t2t":
+            choices = [m["display_name"] for m in METRICS["t2t"]]
+        case "s2t":
+            choices = [m["display_name"] for m in METRICS["s2t"]]
+    return gr.Dropdown(
+        choices=choices, value=choices[0], label="Select Metric", interactive=True
+    )
+
+
 def create_metric_explanation(metric):
     return gr.Markdown(metric["explanation"], container=True)
 
 
-
 # Create the visualization components
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
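`create_metric_selector` uses structural pattern matching, which requires Python 3.10+; note also that a `model_type` outside `"t2t"`/`"s2t"` matches no case, so `choices` stays unbound and the function dies with an `UnboundLocalError`. A sketch of the same dispatch without `match`, failing loudly instead (the `METRICS` stub below only mirrors the assumed shape of the app's dict):

```python
METRICS = {  # stub with the same shape as the app's METRICS
    "t2t": [{"display_name": "Machine Translation (ChrF)"}],
    "s2t": [{"display_name": "Automatic Speech Recognition (ChrF)"}],
}

def metric_choices(model_type):
    if model_type not in METRICS:
        raise ValueError(f"unknown model type: {model_type!r}")
    return [m["display_name"] for m in METRICS[model_type]]

print(metric_choices("s2t"))  # ['Automatic Speech Recognition (ChrF)']
```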
@@ -639,12 +580,6 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     with gr.Row():
         with gr.Column():
             with gr.Accordion("Model Filters", open=False):
-                model_type = gr.Radio(
-                    choices=["Text-to-Text", "Speech-to-Text"],
-                    value="Text-to-Text",
-                    label="Select Model Type",
-                    interactive=True,
-                )
                 model_licenses = gr.CheckboxGroup(
                     choices=["open source", "commercial"],
                     value=["open source", "commercial"],
@@ -667,26 +602,6 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
                     label="Select Unit of Analysis",
                     interactive=True,
                 )
-                region_filter = gr.CheckboxGroup(
-                    choices=[
-                        "Africa",
-                        "Asia",
-                        "Europe",
-                        "North America",
-                        "South America",
-                        "Oceania",
-                    ],
-                    value=[
-                        "Africa",
-                        "Asia",
-                        "Europe",
-                        "North America",
-                        "South America",
-                        "Oceania",
-                    ],
-                    label="Filter by Region",
-                    interactive=True,
-                )
                 family_filter = gr.CheckboxGroup(
                     choices=[
                         "Indo-European",
@@ -717,19 +632,27 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
                     interactive=True,
                 )
         with gr.Row():
-            start_metric = METRICS[0]
-            metric = gr.Dropdown(
-                choices=[metric["display_name"] for metric in METRICS],
-                value=start_metric["display_name"],
-                label="Main task and metric to display in figures and map",
-                interactive=True,
-            )
+            with gr.Column():
+                start_model_type = "Text-to-Text"
+                model_type = gr.Radio(
+                    choices=["Text-to-Text", "Speech-to-Text"],
+                    value=start_model_type,
+                    label="Select Model Type",
+                    interactive=True,
+                )
+            start_metric = METRICS["t2t"][0]
+            metric = gr.Dropdown(
+                choices=[metric["display_name"] for metric in METRICS["t2t"]],
+                value=start_metric["display_name"],
+                label="Main task and metric to display in figures and map",
+                interactive=True,
+            )
 
     metric_explanation = create_metric_explanation(start_metric)
 
     gr.Markdown("## Model Comparison")
-    create_leaderboard_df(start_metric)
+    leaderboard_df = create_leaderboard_df("t2t", start_metric)
 
     model_comparison_plot = gr.Plot(
         value=create_model_comparison_plot(start_metric),
         label="Model Comparison",
@@ -748,34 +671,47 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
         elem_classes="fullwidth-plot",
     )
 
-    def update_component(fn, metric_choice):
-        metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
+    def update_model_type(model_type_choice):
+        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice]
+        return create_metric_selector(model_type), create_leaderboard_df(model_type)
+
+    model_type.change(
+        fn=update_model_type,
+        inputs=model_type,
+        outputs=[metric, leaderboard_df],
+    )
+
+    def update_component(fn, model_type_choice, metric_choice):
+        model_type = {"Text-to-Text": "t2t", "Speech-to-Text": "s2t"}[model_type_choice]
+        metric = [m for m in METRICS[model_type] if m["display_name"] == metric_choice][
+            0
+        ]
         return fn(metric)
 
     metric.change(
         fn=partial(update_component, create_metric_explanation),
-        inputs=metric,
+        inputs=[model_type, metric],
         outputs=metric_explanation,
     )
     metric.change(
         fn=partial(update_component, create_model_comparison_plot),
-        inputs=metric,
+        inputs=[model_type, metric],
         outputs=model_comparison_plot,
     )
     metric.change(
         fn=partial(update_component, create_scatter_plot),
-        inputs=metric,
+        inputs=[model_type, metric],
         outputs=scatter_plot,
     )
     metric.change(
         fn=partial(update_component, create_world_map),
-        inputs=metric,
+        inputs=[model_type, metric],
         outputs=world_map,
     )
 
     with gr.Accordion("Methodology", open=False):
         gr.Markdown(
             """
-    ## Methodology
-
     ### Benchmark Data
     We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
 
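The wiring above relies on two Gradio behaviors worth spelling out: an event handler receives only the values of its `inputs`, and returning a new component instance from the handler replaces the corresponding output. `functools.partial` pre-binds the component factory so one generic `update_component` can serve several different outputs. A minimal runnable sketch of the same pattern (the component and factory names here are hypothetical, not from the app):

```python
from functools import partial

import gradio as gr

def make_greeting(choice):
    # Stand-in for the app's component factories (create_scatter_plot etc.).
    return gr.Markdown(f"You picked **{choice}**")

def update_component(fn, choice):
    # Gradio passes only the values of `inputs`; the factory `fn` is pre-bound.
    return fn(choice)

with gr.Blocks() as demo:
    radio = gr.Radio(choices=["A", "B"], value="A", label="Choice", interactive=True)
    out = gr.Markdown("You picked **A**")
    radio.change(fn=partial(update_component, make_greeting), inputs=radio, outputs=out)

demo.launch()
```

This is the same trick the diff uses to drive the metric explanation, both plots, and the world map from a single handler.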
@@ -804,8 +740,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     - Evaluate predictions using ChrF score against the original text
 
     The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
-    """
-        container=True,
+    """
     )
 
 demo.launch()
evals.py
CHANGED
@@ -93,11 +93,15 @@ def population(bcp_47):
     }
     return items
 
-glottolog = pd.read_csv("data/glottolog_languoid.csv/languoid.csv")
+
+glottolog = pd.read_csv(
+    "data/glottolog_languoid.csv/languoid.csv", na_values=[""], keep_default_na=False
+)  # Min _Nan_ Chinese is not N/A!
 glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
     lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
 )
 
+
 @cache
 def language_family(bcp_47):
     languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]
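The `# Min _Nan_ Chinese is not N/A!` comment is worth unpacking: pandas' default NA list includes the literal string `"nan"`, which happens to be the ISO 639-3 code for Min Nan Chinese, so without `keep_default_na=False` that language's code would be silently read as missing. A reproducible sketch:

```python
import io

import pandas as pd

csv = "iso639P3code,name\nnan,Min Nan Chinese\ndeu,German\n"
default = pd.read_csv(io.StringIO(csv))
strict = pd.read_csv(io.StringIO(csv), na_values=[""], keep_default_na=False)
print(default["iso639P3code"].isna().tolist())  # [True, False] — "nan" was eaten
print(strict["iso639P3code"].tolist())          # ['nan', 'deu']
```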
@@ -106,6 +110,7 @@ def language_family(bcp_47):
     family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
     return family["name"]
 
+
 def script_name(iso15924):
     return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
 
@@ -255,17 +260,20 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
         max_tokens=1024,
     )
     prediction = reply.choices[0].message.content.strip()
-    bleu_score = bleu.compute(
-        predictions=[prediction],
-        references=[target_sentence],
-        tokenizer=tokenizer.tokenize,
-    )
+    if prediction.strip():
+        bleu_score = bleu.compute(
+            predictions=[prediction],
+            references=[target_sentence],
+            tokenizer=tokenizer.tokenize,
+        )
+    else:
+        bleu_score = {"bleu": 0}
     chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
     return {
         "model": model,
         "bcp_47": original_language["bcp_47"],
         "mt_bleu": bleu_score["bleu"],
-        "mt_chrf": chrf_score["score"],
+        "mt_chrf": chrf_score["score"] / 100,
         "sentence_nr": sentence_nr,
     }
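This hunk does two things: it guards against empty predictions before calling `bleu.compute` (an empty hypothesis is at best meaningless and, depending on the tokenizer, can error out), and it divides the chrF `score` by 100. The latter works because `evaluate`'s chrF is backed by sacreBLEU, which reports on a 0–100 scale, while BLEU and accuracy live in 0–1. A quick scale check, assuming the `evaluate` package and its chrf metric are available:

```python
import evaluate

chrf = evaluate.load("chrf")
raw = chrf.compute(
    predictions=["the cat sat"], references=[["the cat sat on the mat"]]
)
print(raw["score"])        # sacreBLEU chrF, on a 0-100 scale
print(raw["score"] / 100)  # normalized to 0-1, comparable with BLEU/accuracy
```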
@@ -371,7 +379,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
     return {
         "model": model,
         "bcp_47": language["bcp_47"],
-        "mlm_chrf": chrf_score["score"],
+        "mlm_chrf": chrf_score["score"] / 100,
         "sentence_nr": nr,
     }
@@ -432,7 +440,7 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
         "model": model,
         "bcp_47": language["bcp_47"],
         "asr_wer": wer_score,
-        "asr_chrf": chrf_score["score"],
+        "asr_chrf": chrf_score["score"] / 100,
         "sentence_nr": nr,
     }
@@ -522,7 +530,7 @@ async def main():
         mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
         cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
         mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-        t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
+        t2t_score = (mt_chrf + cls_acc + mlm_chrf) / 3
         results.append(
             {
                 "model": model,
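With chrF normalized to 0–1, all three components of `t2t_score` are on the same scale and the unweighted mean is well-defined. With illustrative numbers:

```python
# Illustrative values only — each component is now in [0, 1].
mt_chrf, cls_acc, mlm_chrf = 0.52, 0.80, 0.64
t2t_score = (mt_chrf + cls_acc + mlm_chrf) / 3
print(round(t2t_score, 3))  # 0.653
```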
@@ -577,9 +585,7 @@ async def main():
             "t2t_score",
             "s2t_score",
         ]:
-            language_results[score] = mean(
-                [s[score] for s in results if score in s]
-            )
+            language_results[score] = mean([s[score] for s in results if score in s])
         all_results.append(language_results)
     with open("results.json", "w") as f:
         json.dump(all_results, f, indent=2, ensure_ascii=False)
uv.lock
CHANGED
@@ -1000,7 +1000,7 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "gradio", specifier = ">=5.16.2" },
-    { name = "gradio-rangeslider" },
+    { name = "gradio-rangeslider", specifier = ">=0.0.8" },
     { name = "language-data", specifier = ">=1.3.0" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "plotly", specifier = ">=6.0.0" },