David Pomerenke committed
Commit e32fd78 · Parent(s): e9a19be

Filter UI setup

Files changed:
- app.py: +233 -123
- pyproject.toml: +1 -0
- uv.lock: +14 -0
app.py
CHANGED
@@ -6,95 +6,98 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 import pycountry
+from gradio_rangeslider import RangeSlider
 
 with open("results.json") as f:
     languages = json.load(f)
 
-languages_with_scores = [
-    lang for lang in languages if lang["t2t_score"] is not None
-]
+languages_with_scores = [lang for lang in languages if lang["t2t_score"] is not None]
 
 # Global constants for metric mappings
-METRICS = [
+METRICS = {
+    "t2t": [
+        {
+            "display_name": "Overall Text-to-Text Performance",
+            "field_name": "t2t_score",
+            "label": "Overall Score",
+            "explanation": """
 **Overall Score for Text-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
 Higher scores indicate better overall language capabilities.
 """,
+        },
+        {
+            "display_name": "Translation (BLEU)",
+            "field_name": "mt_bleu",
+            "label": "BLEU Score",
+            "explanation": """
 **Translation BLEU**: BiLingual Evaluation Understudy (BLEU) measures how similar AI-generated translations are to human reference translations.
 It calculates n-gram precision and applies a brevity penalty. Scores range from 0 to 1, with higher values indicating better translation quality.
 """,
+        },
+        {
+            "display_name": "Translation (ChrF)",
+            "field_name": "mt_chrf",
+            "label": "ChrF Score",
+            "explanation": """
 **Translation ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
 This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
 Higher scores (0-1) indicate better translations.
 """,
+        },
+        {
+            "display_name": "Classification (Accuracy)",
+            "field_name": "cls_acc",
+            "label": "Classification Accuracy",
+            "explanation": """
 **Classification Accuracy**: Measures how accurately models can classify text into predefined categories.
 This evaluates a model's understanding of content and context across different languages.
 Reported as a percentage where higher values indicate better classification performance.
 """,
+        },
+        {
+            "display_name": "Masked Language Modeling (ChrF)",
+            "field_name": "mlm_chrf",
+            "label": "MLM ChrF Score",
+            "explanation": """
 **Masked Language Modeling ChrF**: Evaluates how well models can predict masked (hidden) portions of text.
 This tests a model's understanding of language structure and semantics by measuring the character-level similarity
 between predicted and actual text. Higher scores indicate better language understanding.
 """,
+        },
+    ],
+    "s2t": [
+        {
+            "display_name": "Overall Speech-to-Text Performance",
+            "field_name": "s2t_score",
+            "label": "Overall Score",
+            "explanation": """
 **Overall Score for Speech-to-Text Performance**: A weighted combination of all metrics, providing a holistic view of model performance across different language tasks.
 Higher scores indicate better overall language capabilities.
 """,
+        },
+        {
+            "display_name": "Automatic Speech Recognition (WER)",
+            "field_name": "asr_wer",
+            "label": "WER",
+            "explanation": """
 **Automatic Speech Recognition Word Error Rate**: Measures the accuracy of speech-to-text transcription.
 It calculates the minimum number of word edits (insertions, deletions, substitutions) needed to transform the
 transcription into the reference text, divided by the number of words in the reference.
 Lower scores indicate better performance, with 0 being perfect transcription.
 """,
+        },
+        {
+            "display_name": "Automatic Speech Recognition ChrF",
+            "field_name": "asr_chrf",
+            "label": "ChrF",
+            "explanation": """
 **Automatic Speech Recognition ChrF**: Character n-gram F-score evaluates translations at the character level rather than word level.
 This metric is particularly valuable for morphologically rich languages and can better capture partial word matches.
 Higher scores (0-1) indicate better translations.
 """,
+        },
-]
+    ],
+}
 
 
 def mean(lst):
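Editorial aside (not part of the commit): the metric descriptions added above explain BLEU, ChrF, accuracy, and WER in prose. The snippet below is a minimal sketch of how such scores can be computed on a toy sentence pair. It assumes the `sacrebleu` and `jiwer` packages purely for illustration; this repository's own evaluation code is not shown in this diff and is not confirmed to use them.

```python
# Hypothetical illustration of the metrics described above; sacrebleu and jiwer
# are assumptions for this sketch, not dependencies added by this commit.
import sacrebleu
import jiwer

reference = "The cat sits on the mat."
hypothesis = "The cat sat on the mat."

# BLEU: n-gram precision with a brevity penalty (sacrebleu reports 0-100, so rescale to 0-1).
bleu = sacrebleu.sentence_bleu(hypothesis, [reference]).score / 100

# ChrF: character n-gram F-score, more forgiving of partial word matches (also rescaled to 0-1).
chrf = sacrebleu.sentence_chrf(hypothesis, [reference]).score / 100

# WER: word edits (insertions, deletions, substitutions) divided by reference word count.
wer = jiwer.wer(reference, hypothesis)

print(f"BLEU={bleu:.2f}  ChrF={chrf:.2f}  WER={wer:.2f}")
```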
@@ -136,7 +139,10 @@ def create_leaderboard_df(metric):
                 "Mid-Resource": [],
                 "Low-Resource": [],
             }
+            # Check if the metric field exists in the score dictionary before accessing it
+            if metric["field_name"] in score:
+                model_scores[model][category].append(score[metric["field_name"]])
+            # If the metric is missing, we'll skip this score
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
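Aside: the diff's own comments explain the guard added in this hunk (skip a score entry when it lacks the selected metric's field). A minimal sketch with made-up score dicts, showing the membership check preventing a KeyError; the entries below are hypothetical:

```python
# Hypothetical score entries; the field names mirror METRICS above.
scores = [
    {"model": "org/model-x", "t2t_score": 0.71, "mt_bleu": 0.35},  # no ASR fields
    {"model": "org/model-y", "s2t_score": 0.58, "asr_wer": 0.22},  # no text-to-text fields
]
metric = {"field_name": "asr_wer"}

values = []
for score in scores:
    # Same pattern as in the diff: skip entries that lack the field instead of indexing blindly.
    if metric["field_name"] in score:
        values.append(score[metric["field_name"]])

print(values)  # [0.22]
```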
@@ -164,7 +170,8 @@ def create_leaderboard_df(metric):
             + categories["Mid-Resource"]
             + categories["Low-Resource"]
         )
+        # Check if all_scores is empty to avoid division by zero
+        overall_avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0
 
         model_name = model.split("/")[-1]
         leaderboard_data.append(
@@ -218,7 +225,9 @@ def create_leaderboard_df(metric):
 
 
 def create_model_comparison_plot(metric):
-    top_languages = sorted(
+    top_languages = sorted(
+        languages_with_scores, key=lambda x: x["speakers"], reverse=True
+    )[:10]
 
     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
@@ -266,10 +275,14 @@ def create_language_stats_df(metric):
 
     for lang in languages:
         # Find the best model and its BLEU score
-        best_model =
+        best_model = (
+            max(
+                lang["scores"] or [{"t2t_score": None, "model": None}],
+                key=lambda x: x.get("t2t_score", 0),
+            )
+            if lang["t2t_score"] is not None
+            else None
+        )
 
         model = best_model["model"] if best_model else None
         model_name = model.split("/")[-1] if model else "N/A"
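For illustration only, here is the selection expression from this hunk run against a made-up language record: it picks the score entry with the highest `t2t_score`, and falls back to `None` when the language has no overall score. The data below is hypothetical.

```python
# Hypothetical language entry shaped like the per-language records in results.json.
lang = {
    "t2t_score": 0.58,
    "scores": [
        {"model": "org/model-a", "t2t_score": 0.49},
        {"model": "org/model-b", "t2t_score": 0.58},
    ],
}

best_model = (
    max(
        lang["scores"] or [{"t2t_score": None, "model": None}],
        key=lambda x: x.get("t2t_score", 0),
    )
    if lang["t2t_score"] is not None
    else None
)

print(best_model["model"] if best_model else "N/A")  # org/model-b
```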
@@ -340,7 +353,9 @@ def create_language_stats_df(metric):
 
 def create_scatter_plot(metric):
     # Filter results to include only languages with sufficient speakers
-    filtered_results = [
+    filtered_results = [
+        lang for lang in languages_with_scores if lang["speakers"] >= 10_000
+    ]
 
     # Create a list to store data for the scatter plot
     scatter_data = []
@@ -602,25 +617,119 @@ def create_world_map(metric):
 
 
 def create_metric_explanation(metric):
-    return gr.Markdown(metric["explanation"])
+    return gr.Markdown(metric["explanation"], container=True)
+
 
 
 # Create the visualization components
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
     gr.Markdown("Comparing language proficiency across different models and languages.")
-    start_metric = METRICS[0]
 
+    language_choices = [
+        f"{lang['language_name']} ({lang['bcp_47']})" for lang in languages
+    ]
+    models = {score["model"] for lang in languages for score in lang["scores"]}
+    search = gr.Dropdown(
+        choices=list(models) + language_choices,
+        value=None,
+        label="Search for Language or Model",
         interactive=True,
     )
+    with gr.Row():
+        with gr.Column():
+            with gr.Accordion("Model Filters", open=False):
+                model_type = gr.Radio(
+                    choices=["Text-to-Text", "Speech-to-Text"],
+                    value="Text-to-Text",
+                    label="Select Model Type",
+                    interactive=True,
+                )
+                model_licenses = gr.CheckboxGroup(
+                    choices=["open source", "commercial"],
+                    value=["open source", "commercial"],
+                    label="Filter by Model License",
+                    interactive=True,
+                )
+                model_sizes = RangeSlider(
+                    minimum=0,
+                    maximum=1000,
+                    value=(0, 1000),
+                    label="Filter by Model Size (in Billion Parameters)",
+                    interactive=True,
+                )
+
+        with gr.Column():
+            with gr.Accordion("Language Filters", open=False):
+                unit_of_analysis = gr.Radio(
+                    choices=["Languages", "Language Families", "Regions"],
+                    value="Languages",
+                    label="Select Unit of Analysis",
+                    interactive=True,
+                )
+                region_filter = gr.CheckboxGroup(
+                    choices=[
+                        "Africa",
+                        "Asia",
+                        "Europe",
+                        "North America",
+                        "South America",
+                        "Oceania",
+                    ],
+                    value=[
+                        "Africa",
+                        "Asia",
+                        "Europe",
+                        "North America",
+                        "South America",
+                        "Oceania",
+                    ],
+                    label="Filter by Region",
+                    interactive=True,
+                )
+                family_filter = gr.CheckboxGroup(
+                    choices=[
+                        "Indo-European",
+                        "Sino-Tibetan",
+                        "Afro-Asiatic",
+                        "Dravidian",
+                        "Uralic",
+                        "Austronesian",
+                        "Other",
+                    ],
+                    value=[
+                        "Indo-European",
+                        "Sino-Tibetan",
+                        "Afro-Asiatic",
+                        "Dravidian",
+                        "Uralic",
+                        "Austronesian",
+                        "Other",
+                    ],
+                    label="Filter by Language Family",
+                    interactive=True,
+                )
+                speakers_filter = RangeSlider(
+                    minimum=0,
+                    maximum=100_000_000,
+                    value=(0, 100_000_000),
+                    label="Filter by Number of Speakers",
+                    interactive=True,
+                )
+    with gr.Row():
+        start_metric = METRICS["t2t"][0]
+        metric = gr.Dropdown(
+            choices=[metric["display_name"] for metric in METRICS["t2t"]],
+            value=start_metric["display_name"],
+            label="Main metric to display in figures and map",
+            interactive=True,
+        )
+
+        metric_explanation = create_metric_explanation(start_metric)
 
     gr.Markdown("## Model Comparison")
+    create_leaderboard_df(start_metric)
+
     model_comparison_plot = gr.Plot(
         value=create_model_comparison_plot(start_metric),
         label="Model Comparison",
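Aside: this commit only lays out the filter controls; no event handlers are attached to them in this diff (the `metric.change` handlers further down are commented out). Below is a minimal, self-contained sketch of how a `RangeSlider` value (a `(low, high)` tuple) and a `CheckboxGroup` could eventually drive filtering. The `results` data, the `filter_langs` function, the `gr.JSON` output, and the use of `.change` events here are all assumptions for the sketch, not part of app.py.

```python
import gradio as gr
from gradio_rangeslider import RangeSlider

# Toy stand-in for the benchmark's per-language records (hypothetical values).
results = [
    {"language_name": "German", "speakers": 76_000_000, "region": "Europe"},
    {"language_name": "Swahili", "speakers": 16_000_000, "region": "Africa"},
]

def filter_langs(speaker_range, regions):
    low, high = speaker_range  # RangeSlider passes its value as a (min, max) tuple
    return [
        r for r in results
        if low <= r["speakers"] <= high and r["region"] in regions
    ]

with gr.Blocks() as sketch:
    speakers = RangeSlider(
        minimum=0, maximum=100_000_000, value=(0, 100_000_000), label="Speakers"
    )
    regions = gr.CheckboxGroup(
        choices=["Africa", "Europe"], value=["Africa", "Europe"], label="Region"
    )
    out = gr.JSON(label="Filtered languages")
    # Re-run the filter whenever either control changes.
    speakers.change(filter_langs, inputs=[speakers, regions], outputs=out)
    regions.change(filter_langs, inputs=[speakers, regions], outputs=out)

sketch.launch()
```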
@@ -639,63 +748,64 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
         elem_classes="fullwidth-plot",
     )
 
-    gr.Markdown(
-        """
-        ## Methodology
-
-        ### Benchmark Data
-        We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
-
-        Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
-
-        ### AI Models
-        We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
-
-        ### Evaluation Tasks
-        Our benchmark includes three core tasks to assess different aspects of language understanding:
-
-        1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
-        - [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
-        - [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
-
-        2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
-        - Group sentences by URL into paragraphs with the same topic
-        - Use the 5 most common topics, encoded as numbers rather than English labels
-        - Provide 5 examples of each topic as few-shot examples
-        - Test the model's ability to classify new text
-        - Report accuracy as the primary metric
-
-        3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
-        - Mask approximately 5% of each sentence at a random position
-        - Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
-        - Evaluate predictions using ChrF score against the original text
-
-        The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
-        """,
-        container=True,
-    )
-
     def update_component(fn, metric_choice):
         metric = [m for m in METRICS if m["display_name"] == metric_choice][0]
         return fn(metric)
 
-    metric.change(
-        fn=partial(update_component, create_metric_explanation),
-        inputs=metric,
-        outputs=metric_explanation,
-    )
-    metric.change(
-        fn=partial(update_component, create_model_comparison_plot),
-        inputs=metric,
-        outputs=model_comparison_plot,
-    )
-    metric.change(
-        fn=partial(update_component, create_scatter_plot),
-        inputs=metric,
-        outputs=scatter_plot,
-    )
-    metric.change(
-        fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
-    )
+    # metric.change(
+    #     fn=partial(update_component, create_metric_explanation),
+    #     inputs=metric,
+    #     outputs=metric_explanation,
+    # )
+    # metric.change(
+    #     fn=partial(update_component, create_model_comparison_plot),
+    #     inputs=metric,
+    #     outputs=model_comparison_plot,
+    # )
+    # metric.change(
+    #     fn=partial(update_component, create_scatter_plot),
+    #     inputs=metric,
+    #     outputs=scatter_plot,
+    # )
+    # metric.change(
+    #     fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map
+    # )
+
+    with gr.Accordion("Methodology", open=False):
+        gr.Markdown(
+            """
+            ## Methodology
+
+            ### Benchmark Data
+            We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
+
+            Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
+
+            ### AI Models
+            We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
+
+            ### Evaluation Tasks
+            Our benchmark includes three core tasks to assess different aspects of language understanding:
+
+            1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
+            - [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
+            - [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
+
+            2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
+            - Group sentences by URL into paragraphs with the same topic
+            - Use the 5 most common topics, encoded as numbers rather than English labels
+            - Provide 5 examples of each topic as few-shot examples
+            - Test the model's ability to classify new text
+            - Report accuracy as the primary metric
+
+            3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
+            - Mask approximately 5% of each sentence at a random position
+            - Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
+            - Evaluate predictions using ChrF score against the original text
+
+            The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
+            """,
+            container=True,
+        )
 
 demo.launch()
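Aside: the Methodology text moved into the accordion above describes the masked language modeling task (mask roughly 5% of each sentence at a random position, then score predictions with ChrF). The benchmark's actual task code is not part of this diff; the following is a minimal sketch of that masking step, with the helper name and its details assumed for illustration.

```python
import random

def mask_sentence(sentence: str, mask_share: float = 0.05, token: str = "<mask>") -> str:
    """Hypothetical helper: hide a contiguous ~5% span of words at a random position."""
    words = sentence.split()
    n_mask = max(1, round(len(words) * mask_share))
    start = random.randint(0, len(words) - n_mask)
    masked = words[:start] + [token] + words[start + n_mask:]
    return " ".join(masked)

original = "The quick brown fox jumps over the lazy dog near the river bank today"
print(mask_sentence(original))
# e.g. "The quick brown fox jumps over <mask> lazy dog near the river bank today"
```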
pyproject.toml
CHANGED
@@ -5,6 +5,7 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
+    "gradio-rangeslider>=0.0.8",
     "gradio>=5.16.2",
     "language-data>=1.3.0",
     "pandas>=2.2.3",
uv.lock
CHANGED
@@ -695,6 +695,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/16/52/4fe9dfc2239e7b748ad8dc3b80ad8755f5c9378432715193586c3ab74bf9/gradio_client-1.7.1-py3-none-any.whl", hash = "sha256:d7737bc473a2093549c06004379c42f0a3510a98095cf7cea9033837e252149f", size = 321994 },
 ]
 
+[[package]]
+name = "gradio-rangeslider"
+version = "0.0.8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "gradio" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/88/93/aa1723076cc1056279e43f46d804f663c794f2e098c0650d150b46372f62/gradio_rangeslider-0.0.8.tar.gz", hash = "sha256:414fd4b093d4327cc86c51b4e81e53df19664023f95bed95e3ecc03071fa1ea0", size = 1263683 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/54/44/14f759678a76ffbd6e3fe8852a4f758bc5ad51b379c67b5d167cfde752f6/gradio_rangeslider-0.0.8-py3-none-any.whl", hash = "sha256:3728c44e58ec1bff0bdf236cc84f12b183fbd596fb4714d8b797585a0515f89e", size = 1224388 },
+]
+
 [[package]]
 name = "h11"
 version = "0.14.0"
@@ -958,6 +970,7 @@ version = "0.1.0"
 source = { virtual = "." }
 dependencies = [
     { name = "gradio" },
+    { name = "gradio-rangeslider" },
     { name = "language-data" },
     { name = "pandas" },
     { name = "plotly" },
@@ -987,6 +1000,7 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "gradio", specifier = ">=5.16.2" },
+    { name = "gradio-rangeslider" },
     { name = "language-data", specifier = ">=1.3.0" },
     { name = "pandas", specifier = ">=2.2.3" },
     { name = "plotly", specifier = ">=6.0.0" },