David Pomerenke committed
Commit e9a19be · Parent(s): 040dc35

Separate overall scores for T2T / S2T

Files changed:
- app.py +60 -37
- evals.py +5 -4
- results.json +0 -0
app.py CHANGED

@@ -8,20 +8,24 @@ import plotly.graph_objects as go
 import pycountry
 
 with open("results.json") as f:
-
+    languages = json.load(f)
+
+languages_with_scores = [
+    lang for lang in languages if lang["t2t_score"] is not None
+]
 
 # Global constants for metric mappings
-METRICS = {
-
-        "display_name": "Overall Performance",
-        "field_name": "
-        "label": "Overall
+METRICS = [
+    {
+        "display_name": "Overall Text-to-Text Performance",
+        "field_name": "t2t_score",
+        "label": "Overall Score",
         "explanation": """
-        **Overall Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        **Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
         Higher scores indicate better overall language capabilities.
         """,
     },
-
+    {
         "display_name": "Translation (BLEU)",
         "field_name": "mt_bleu",
         "label": "BLEU Score",
@@ -30,7 +34,7 @@ METRICS = {
         It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
         """,
     },
-
+    {
         "display_name": "Translation (ChrF)",
         "field_name": "mt_chrf",
         "label": "ChrF Score",
@@ -40,7 +44,7 @@ METRICS = {
         Higher scores (0-1) indicate better translations.
         """,
     },
-
+    {
         "display_name": "Classification (Accuracy)",
         "field_name": "cls_acc",
         "label": "Classification Accuracy",
@@ -50,7 +54,7 @@ METRICS = {
         Reported as a percentage where higher values indicate better classification performance.
         """,
     },
-
+    {
         "display_name": "Masked Language Modeling (ChrF)",
         "field_name": "mlm_chrf",
         "label": "MLM ChrF Score",
@@ -60,7 +64,16 @@ METRICS = {
         between predicted and actual text. Higher scores indicate better language understanding.
         """,
     },
-
+    {
+        "display_name": "Overall Speech-to-Text Performance",
+        "field_name": "s2t_score",
+        "label": "Overall Score",
+        "explanation": """
+        **Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
+        Higher scores indicate better overall language capabilities.
+        """,
+    },
+    {
         "display_name": "Automatic Speech Recognition (WER)",
         "field_name": "asr_wer",
         "label": "WER",
@@ -71,7 +84,7 @@ METRICS = {
         Lower scores indicate better performance, with 0 being perfect transcription.
         """,
     },
-
+    {
         "display_name": "Automatic Speech Recognition ChrF",
         "field_name": "asr_chrf",
         "label": "ChrF",
@@ -80,8 +93,8 @@ METRICS = {
         This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
         Higher scores (0-1) indicate better translations.
         """,
-    },
-}
+    },
+]
 
 
 def mean(lst):
@@ -91,7 +104,7 @@ def mean(lst):
 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
     langs_with_score = [
-        lang for lang in
+        lang for lang in languages_with_scores if lang[metric["field_name"]] is not None
     ]
     sorted_langs = sorted(
         langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
@@ -106,7 +119,7 @@ def create_leaderboard_df(metric):
 
     # Get all model scores with categorization
     model_scores = {}
-    for lang in
+    for lang in languages_with_scores:
         category = (
             "High-Resource"
             if lang["language_name"] in high_resource
@@ -205,7 +218,7 @@ def create_leaderboard_df(metric):
 
 
 def create_model_comparison_plot(metric):
-    top_languages = sorted(
+    top_languages = sorted(languages_with_scores, key=lambda x: x["speakers"], reverse=True)[:10]
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
@@ -251,14 +264,14 @@ def create_language_stats_df(metric):
     # Create a list to store flattened data
     flat_data = []
 
-    for lang in
+    for lang in languages:
         # Find the best model and its BLEU score
         best_model = max(
-            lang["scores"] or [{"
-            key=lambda x: x
-        )
+            lang["scores"] or [{"t2t_score": None, "model": None}],
+            key=lambda x: x.get("t2t_score", 0),
+        ) if lang["t2t_score"] is not None else None
 
-        model = best_model["model"]
+        model = best_model["model"] if best_model else None
         model_name = model.split("/")[-1] if model else "N/A"
         model_link = (
             f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
@@ -274,9 +287,9 @@ def create_language_stats_df(metric):
             "Language": f"**{lang['language_name']}**",
             "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
             # "Models Tested": len(lang["scores"]),
-            "Overall": round(lang["overall_score"], 3)
-            if lang["overall_score"] is not None
-            else "N/A",
+            # "Overall": round(lang["overall_score"], 3)
+            # if lang["overall_score"] is not None
+            # else "N/A",
             "Translation": round(lang["mt_bleu"], 3)
             if lang["mt_bleu"] is not None
             else "N/A",
@@ -286,9 +299,7 @@ def create_language_stats_df(metric):
             "MLM": round(lang["mlm_chrf"], 3)
             if lang["mlm_chrf"] is not None
             else "N/A",
-            "ASR": round(lang["asr_wer"], 3)
-            if lang["asr_wer"] is not None
-            else "N/A",
+            "ASR": round(lang["asr_wer"], 3) if lang["asr_wer"] is not None else "N/A",
             "Best Model": model_link,
             "CommonVoice Hours": commonvoice_link,
         }
@@ -296,9 +307,22 @@ def create_language_stats_df(metric):
 
     df = pd.DataFrame(flat_data)
     return gr.DataFrame(
-        value=df,
+        value=df,
         label="Language Results",
        show_search="search",
+        pinned_columns=1,
+        column_widths=[
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+            "100px",
+        ],
         datatype=[
             "markdown",  # Language
             "number",  # Speakers
@@ -316,7 +340,7 @@ def create_language_stats_df(metric):
 
 def create_scatter_plot(metric):
     # Filter results to include only languages with sufficient speakers
-    filtered_results = [lang for lang in
+    filtered_results = [lang for lang in languages_with_scores if lang["speakers"] >= 10_000]
 
     # Create a list to store data for the scatter plot
     scatter_data = []
@@ -434,7 +458,7 @@ def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
-    for lang in
+    for lang in languages:
         # Skip languages without the required data
         if "population" not in lang or lang[metric["field_name"]] is None:
             continue
@@ -585,10 +609,10 @@ def create_metric_explanation(metric):
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown("Comparing language proficiency across different models and languages.")
-    start_metric = METRICS[
+    start_metric = METRICS[0]
 
     metric = gr.Dropdown(
-        choices=[metric_info["display_name"] for metric_info in METRICS
+        choices=[metric_info["display_name"] for metric_info in METRICS],
         value=start_metric["display_name"],
         label="Select Metric",
         interactive=True,
@@ -596,7 +620,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     metric_explanation = create_metric_explanation(start_metric)
 
     gr.Markdown("## Model Comparison")
-    create_leaderboard_df(start_metric)
+    # create_leaderboard_df(start_metric)
     model_comparison_plot = gr.Plot(
         value=create_model_comparison_plot(start_metric),
         label="Model Comparison",
@@ -652,10 +676,9 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     )
 
     def update_component(fn, metric_choice):
-        metric = [m for m in METRICS
+        metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
         return fn(metric)
 
-
     metric.change(
         fn=partial(update_component, create_metric_explanation),
         inputs=metric,
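Since METRICS is now a flat list rather than a dict keyed by metric name, anything that previously indexed it by key has to search by display_name instead, which is what the dropdown wiring and update_component above do. A minimal standalone sketch of that lookup, with a hypothetical lang record (the field names match the diff; the values are made up, not taken from results.json):

# Standalone sketch of the display-name lookup used in app.py above.
# The trimmed METRICS entries and the sample `lang` dict are illustrative only.
METRICS = [
    {"display_name": "Overall Text-to-Text Performance", "field_name": "t2t_score"},
    {"display_name": "Overall Speech-to-Text Performance", "field_name": "s2t_score"},
]

lang = {"language_name": "Swahili", "t2t_score": 0.42, "s2t_score": 0.55}  # made-up values


def lookup_metric(metric_choice):
    # Same list search as update_component: first entry whose display_name matches.
    return [m for m in METRICS if m["display_name"] == metric_choice][0]


metric = lookup_metric("Overall Speech-to-Text Performance")
print(lang[metric["field_name"]])  # 0.55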
evals.py CHANGED

@@ -522,7 +522,7 @@ async def main():
         mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
         cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
         mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-
+        t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
         results.append(
             {
                 "model": model,
@@ -531,7 +531,7 @@ async def main():
                 "mt_chrf": mt_chrf,
                 "cls_acc": cls_acc,
                 "mlm_chrf": mlm_chrf,
-                "
+                "t2t_score": t2t_score,
             }
         )
     for model in transcription_models:
@@ -550,7 +550,7 @@ async def main():
                 "model_type": "speech-to-text",
                 "asr_wer": asr_wer,
                 "asr_chrf": asr_chrf,
-                "
+                "s2t_score": (asr_wer + asr_chrf) / 2,
             }
         )
     language_results = {
@@ -574,7 +574,8 @@ async def main():
         "mlm_chrf",
         "asr_wer",
         "asr_chrf",
-        "
+        "t2t_score",
+        "s2t_score",
     ]:
         language_results[score] = mean(
             [s[score] for s in results if score in s]
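For reference, the two new aggregates follow directly from the formulas in the hunks above: t2t_score averages translation ChrF, classification accuracy, and MLM ChrF (the ChrF values are divided by 100 to put them on the same 0-1 scale as accuracy), and s2t_score is the plain average of ASR WER and ASR ChrF. A small worked example with made-up inputs:

# Illustrative inputs only; the formulas mirror the evals.py diff above.
mt_chrf = 55.0   # translation ChrF, 0-100 scale
cls_acc = 0.70   # classification accuracy, 0-1 scale
mlm_chrf = 62.0  # MLM ChrF, 0-100 scale

t2t_score = (mt_chrf / 100 + cls_acc + mlm_chrf / 100) / 3
print(round(t2t_score, 3))  # 0.623

asr_wer = 0.35   # word error rate, lower is better
asr_chrf = 0.60  # character F-score, higher is better

s2t_score = (asr_wer + asr_chrf) / 2
print(round(s2t_score, 3))  # 0.475

As written, the speech-to-text aggregate is a straight average of a lower-is-better metric (WER) and a higher-is-better one (ChrF).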
results.json CHANGED

The diff for this file is too large to render. See raw diff.