David Pomerenke
committed on
Commit
·
29c8ef6
1
Parent(s):
56081d8
Make a map
Browse files- app.py +215 -8
- evals.py +16 -2
- pyproject.toml +1 -0
- requirements.txt +2 -0
- results.json +571 -29
- uv.lock +11 -0
app.py
CHANGED
@@ -1,8 +1,10 @@
|
|
1 |
import json
|
2 |
|
3 |
import gradio as gr
|
|
|
4 |
import pandas as pd
|
5 |
import plotly.graph_objects as go
|
|
|
6 |
|
7 |
with open("results.json") as f:
|
8 |
results = json.load(f)
|
@@ -157,10 +159,17 @@ def create_model_comparison_plot(results):
|
|
157 |
fig = go.Figure(data=traces)
|
158 |
fig.update_layout(
|
159 |
title="BLEU Scores by Model and Language",
|
160 |
-
xaxis_title=
|
161 |
yaxis_title="BLEU Score",
|
162 |
barmode="group",
|
163 |
height=500,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
)
|
165 |
return fig
|
166 |
|
@@ -175,10 +184,18 @@ def create_language_stats_df(results):
|
|
175 |
lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
|
176 |
)
|
177 |
|
178 |
-
model = best_score[
|
179 |
-
model_name = model.split(
|
180 |
-
model_link =
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
row = {
|
183 |
"Language": f"**{lang['language_name']}**",
|
184 |
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
|
@@ -199,7 +216,15 @@ def create_language_stats_df(results):
|
|
199 |
value=df,
|
200 |
label="Language Results",
|
201 |
show_search="search",
|
202 |
-
datatype=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
203 |
)
|
204 |
|
205 |
|
@@ -224,7 +249,7 @@ def create_scatter_plot(results):
|
|
224 |
)
|
225 |
|
226 |
fig.update_layout(
|
227 |
-
title=
|
228 |
xaxis_title="Number of Speakers (Millions)",
|
229 |
yaxis_title="Average BLEU Score",
|
230 |
height=500,
|
@@ -237,6 +262,186 @@ def create_scatter_plot(results):
|
|
237 |
return fig
|
238 |
|
239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
240 |
# Create the visualization components
|
241 |
with gr.Blocks(title="AI Language Translation Benchmark") as demo:
|
242 |
gr.Markdown("# AI Language Translation Benchmark")
|
@@ -246,11 +451,13 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
|
|
246 |
|
247 |
bar_plot = create_model_comparison_plot(results)
|
248 |
scatter_plot = create_scatter_plot(results)
|
|
|
249 |
|
250 |
create_leaderboard_df(results)
|
251 |
gr.Plot(value=bar_plot, label="Model Comparison")
|
252 |
create_language_stats_df(results)
|
253 |
-
gr.Plot(value=scatter_plot, label="
|
|
|
254 |
|
255 |
gr.Markdown(
|
256 |
"""
|
|
|
1 |
import json
|
2 |
|
3 |
import gradio as gr
|
4 |
+
import numpy as np
|
5 |
import pandas as pd
|
6 |
import plotly.graph_objects as go
|
7 |
+
import pycountry
|
8 |
|
9 |
with open("results.json") as f:
|
10 |
results = json.load(f)
|
|
|
159 |
fig = go.Figure(data=traces)
|
160 |
fig.update_layout(
|
161 |
title="BLEU Scores by Model and Language",
|
162 |
+
xaxis_title=None,
|
163 |
yaxis_title="BLEU Score",
|
164 |
barmode="group",
|
165 |
height=500,
|
166 |
+
legend=dict(
|
167 |
+
orientation="h", # horizontal orientation
|
168 |
+
yanchor="bottom",
|
169 |
+
y=-0.3, # position below plot
|
170 |
+
xanchor="center",
|
171 |
+
x=0.5, # center horizontally
|
172 |
+
),
|
173 |
)
|
174 |
return fig
|
175 |
|
|
|
184 |
lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
|
185 |
)
|
186 |
|
187 |
+
model = best_score["model"]
|
188 |
+
model_name = model.split("/")[-1] if model else "N/A"
|
189 |
+
model_link = (
|
190 |
+
f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>"
|
191 |
+
if model
|
192 |
+
else "N/A"
|
193 |
+
)
|
194 |
+
commonvoice_link = (
|
195 |
+
f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>"
|
196 |
+
if lang["commonvoice_hours"]
|
197 |
+
else "N/A"
|
198 |
+
)
|
199 |
row = {
|
200 |
"Language": f"**{lang['language_name']}**",
|
201 |
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
|
|
|
216 |
value=df,
|
217 |
label="Language Results",
|
218 |
show_search="search",
|
219 |
+
datatype=[
|
220 |
+
"markdown",
|
221 |
+
"number",
|
222 |
+
"number",
|
223 |
+
"number",
|
224 |
+
"markdown",
|
225 |
+
"number",
|
226 |
+
"markdown",
|
227 |
+
],
|
228 |
)
|
229 |
|
230 |
|
|
|
249 |
)
|
250 |
|
251 |
fig.update_layout(
|
252 |
+
title=None,
|
253 |
xaxis_title="Number of Speakers (Millions)",
|
254 |
yaxis_title="Average BLEU Score",
|
255 |
height=500,
|
|
|
262 |
return fig
|
263 |
|
264 |
|
265 |
+
def format_number(n):
    """Format a number with a K/M suffix for compact display.

    Values of at least one million are shown with one decimal and an ``M``
    suffix, values of at least one thousand are shown as whole thousands with
    a ``K`` suffix, and anything smaller is returned via ``str(n)``.

    Args:
        n: The number to format.

    Returns:
        The formatted string, e.g. ``"1.5M"``, ``"63K"`` or ``"999"``.
    """
    if n >= 1_000_000:
        return f"{n / 1_000_000:.1f}M"
    if n >= 1_000:
        thousands = f"{n / 1_000:.0f}"
        # Values just below one million round up to "1000"; promote them to
        # the next unit instead of emitting the odd-looking "1000K".
        if thousands == "1000":
            return "1.0M"
        return f"{thousands}K"
    return str(n)
|
272 |
+
|
273 |
+
|
274 |
+
def create_world_map(results):
    """Build a choropleth of speaker-weighted BLEU scores per country.

    Every language entry in ``results`` that has a ``"population"`` mapping
    (two-letter country code -> number of speakers) and a non-``None``
    ``"bleu"`` value contributes to each country it is spoken in. A country's
    colour is the average BLEU of its languages, weighted by the number of
    speakers of each language in that country. The hover text lists the five
    languages with the most speakers and summarises the remaining ones.

    Args:
        results: Iterable of per-language dicts. The keys read here are
            ``"population"``, ``"bleu"`` and ``"language_name"``.

    Returns:
        A ``plotly.graph_objects.Figure`` containing the choropleth map.
    """

    def make_black_bar(value, max_width=10):
        """Render ``value`` (expected in 0..1) as a black/white block bar."""
        # Clamp so out-of-range values cannot produce a bar that is longer
        # or shorter than max_width.
        filled = max(0, min(int(value * max_width), max_width))
        return "⬛️" * filled + "⬜️" * (max_width - filled)

    def make_colored_bar(value, max_width=10):
        """Create a colored bar using Unicode blocks

        The thresholds apply to the value passed in; callers in this
        function pass BLEU rescaled from the 0.2-0.4 range to 0-1.

        🟦 for high values (>0.35)
        🟨 for medium values (0.25-0.35)
        🟥 for low values (<0.25)
        ⬜ for empty space
        """
        filled = max(0, min(int(value * max_width), max_width))
        empty = max_width - filled

        if value > 0.35:
            block = "🟦"
        elif value > 0.25:
            block = "🟨"
        else:
            block = "🟥"
        return block * filled + "⬜" * empty

    # Collect all country data
    country_data = {}
    for lang in results:
        bleu = lang.get("bleu")
        if "population" not in lang or bleu is None:
            continue

        for country_code, speakers in lang["population"].items():
            # Resolve everything that can fail *before* touching the
            # accumulators, so a skipped entry never leaves a country with
            # totals that do not match its language list.
            try:
                # Convert alpha_2 (2-letter) to alpha_3 (3-letter) code
                country = pycountry.countries.get(alpha_2=country_code)
                if country is None:
                    continue
                iso3_code = country.alpha_3
                entry = {
                    "name": lang["language_name"],
                    "speakers": speakers,
                    "bleu": bleu,
                }
            except (KeyError, AttributeError):
                # Skip invalid or unrecognized country codes
                continue

            bucket = country_data.setdefault(
                iso3_code,
                {"total_speakers": 0, "weighted_bleu_sum": 0, "languages": []},
            )
            bucket["total_speakers"] += speakers
            bucket["weighted_bleu_sum"] += speakers * bleu
            bucket["languages"].append(entry)

    # Calculate final weighted averages and prepare hover text
    countries = []
    bleu_scores = []
    hover_texts = []

    for country_code, data in country_data.items():
        total_speakers = data["total_speakers"]
        if total_speakers <= 0:
            # No speakers recorded: a weighted average is undefined, and the
            # divisions below would raise ZeroDivisionError.
            continue

        weighted_avg = data["weighted_bleu_sum"] / total_speakers

        try:
            country_name = pycountry.countries.get(alpha_3=country_code).name
        except AttributeError:
            country_name = country_code

        # Sort languages by number of speakers
        langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)

        # Take top 5 languages and summarize the rest
        main_langs = langs[:5]
        other_langs = langs[5:]

        # Create language rows with bars
        lang_rows = []
        for entry in main_langs:
            share = entry["speakers"] / total_speakers
            speaker_bar = make_black_bar(share)
            # Scale BLEU from 0.2-0.4 to 0-1 for the bar
            bleu_bar = make_colored_bar((entry["bleu"] - 0.2) / 0.2)

            lang_rows.append(
                f"<b>{entry['name']}</b><br>"
                f"{speaker_bar} {format_number(entry['speakers'])} speakers<br>"
                f"{bleu_bar} {entry['bleu']:.3f} BLEU<br>"
            )

        # Add summary for other languages if any
        if other_langs:
            other_speakers = sum(entry["speakers"] for entry in other_langs)
            other_share = other_speakers / total_speakers
            # Plain (unweighted) mean over the remaining languages
            other_avg_bleu = sum(entry["bleu"] for entry in other_langs) / len(
                other_langs
            )

            speaker_bar = make_black_bar(other_share)
            bleu_bar = make_colored_bar((other_avg_bleu - 0.2) / 0.2)

            lang_rows.append(
                f"<b>+{len(other_langs)} other languages</b><br>"
                f"{speaker_bar} {format_number(other_speakers)} speakers<br>"
                f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
            )

        # Create overall BLEU visualization
        bleu_percentage = (weighted_avg - 0.2) / 0.2  # Scale from 0.2-0.4 to 0-1
        overall_bleu_bar = make_colored_bar(bleu_percentage)

        hover_text = (
            f"<b>{country_name}</b><br><br>"
            f"{format_number(total_speakers)} speakers*<br>"
            f"{overall_bleu_bar} {weighted_avg:.3f} BLEU<br><br>"
            f"<b>Languages:</b><br><br>"
        ) + "<br>".join(lang_rows)

        countries.append(country_code)
        bleu_scores.append(weighted_avg)
        hover_texts.append(hover_text)

    # Create the choropleth map
    fig = go.Figure(
        data=go.Choropleth(
            locations=countries,
            locationmode="ISO-3",
            z=bleu_scores,
            text=hover_texts,
            hoverinfo="text",
            colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
            colorbar=dict(
                title="BLEU Score",
                orientation="h",  # horizontal orientation
                y=-0.2,  # position below map
                yanchor="bottom",
                len=0.5,  # length of colorbar
                x=0.5,  # center horizontally
                xanchor="center",
                thickness=20,  # make it a bit thicker when horizontal
            ),
            zmin=0.2,
            zmax=0.5,
        )
    )

    fig.update_layout(
        title=dict(text="BLEU Score by Country", x=0.5, xanchor="center"),
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection_type="equal earth",
            showland=True,
            landcolor="#f8f9fa",
            coastlinecolor="#e0e0e0",
            countrycolor="#e0e0e0",
        ),
        height=600,
        margin=dict(l=0, r=0, t=30, b=0),
        paper_bgcolor="white",
        hoverlabel=dict(
            bgcolor="beige",
            font_size=12,
        ),
    )

    return fig
|
443 |
+
|
444 |
+
|
445 |
# Create the visualization components
|
446 |
with gr.Blocks(title="AI Language Translation Benchmark") as demo:
|
447 |
gr.Markdown("# AI Language Translation Benchmark")
|
|
|
451 |
|
452 |
bar_plot = create_model_comparison_plot(results)
|
453 |
scatter_plot = create_scatter_plot(results)
|
454 |
+
world_map = create_world_map(results)
|
455 |
|
456 |
create_leaderboard_df(results)
|
457 |
gr.Plot(value=bar_plot, label="Model Comparison")
|
458 |
create_language_stats_df(results)
|
459 |
+
gr.Plot(value=scatter_plot, label="Speaker population vs BLEU")
|
460 |
+
gr.Plot(value=world_map, container=False, elem_classes="fullwidth-plot")
|
461 |
|
462 |
gr.Markdown(
|
463 |
"""
|
evals.py
CHANGED
@@ -62,6 +62,15 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
|
|
62 |
)
|
63 |
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
def script_name(iso15924):
|
66 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
67 |
|
@@ -246,8 +255,13 @@ async def main():
|
|
246 |
"speakers": language.speakers,
|
247 |
"scores": results_for_language,
|
248 |
"bleu": mean([s["bleu"] for s in results_for_language]),
|
249 |
-
"commonvoice_hours": language.commonvoice_hours
|
250 |
-
|
|
|
|
|
|
|
|
|
|
|
251 |
}
|
252 |
)
|
253 |
with open("results.json", "w") as f:
|
|
|
62 |
)
|
63 |
|
64 |
|
65 |
+
def population(bcp_47):
    """Return speaker counts per territory for one language tag.

    Scans ``LANGUAGE_SPEAKING_POPULATION`` for keys of the form
    ``<bcp_47>-<two uppercase letters>`` and returns the matching entries
    keyed by that two-letter suffix alone.

    Args:
        bcp_47: Language tag to look up. It is matched literally, so regex
            metacharacters in the tag have no special meaning.

    Returns:
        Dict mapping the two-letter suffix to the stored population value;
        empty when no key matches.
    """
    # Capture the territory directly instead of stripping a "^[a-z]+-"
    # prefix: for a tag with more than one subtag that strip would leave
    # the extra subtags in the key.
    pattern = re.compile(rf"{re.escape(bcp_47)}-([A-Z]{{2}})")
    items = {}
    for tag, pop in LANGUAGE_SPEAKING_POPULATION.items():
        match = pattern.fullmatch(tag)
        if match:
            items[match.group(1)] = pop
    return items
|
72 |
+
|
73 |
+
|
74 |
def script_name(iso15924):
|
75 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
76 |
|
|
|
255 |
"speakers": language.speakers,
|
256 |
"scores": results_for_language,
|
257 |
"bleu": mean([s["bleu"] for s in results_for_language]),
|
258 |
+
"commonvoice_hours": language.commonvoice_hours
|
259 |
+
if not pd.isna(language.commonvoice_hours)
|
260 |
+
else None,
|
261 |
+
"commonvoice_locale": language.commonvoice_locale
|
262 |
+
if not pd.isna(language.commonvoice_locale)
|
263 |
+
else None,
|
264 |
+
"population": population(language.bcp_47),
|
265 |
}
|
266 |
)
|
267 |
with open("results.json", "w") as f:
|
pyproject.toml
CHANGED
@@ -8,6 +8,7 @@ dependencies = [
|
|
8 |
"gradio>=5.16.2",
|
9 |
"pandas>=2.2.3",
|
10 |
"plotly>=6.0.0",
|
|
|
11 |
]
|
12 |
|
13 |
[tool.uv]
|
|
|
8 |
"gradio>=5.16.2",
|
9 |
"pandas>=2.2.3",
|
10 |
"plotly>=6.0.0",
|
11 |
+
"pycountry>=24.6.1",
|
12 |
]
|
13 |
|
14 |
[tool.uv]
|
requirements.txt
CHANGED
@@ -88,6 +88,8 @@ pillow==11.1.0
|
|
88 |
# via gradio
|
89 |
plotly==6.0.0
|
90 |
# via languagebench (pyproject.toml)
|
|
|
|
|
91 |
pydantic==2.10.6
|
92 |
# via
|
93 |
# fastapi
|
|
|
88 |
# via gradio
|
89 |
plotly==6.0.0
|
90 |
# via languagebench (pyproject.toml)
|
91 |
+
pycountry==24.6.1
|
92 |
+
# via languagebench (pyproject.toml)
|
93 |
pydantic==2.10.6
|
94 |
# via
|
95 |
# fastapi
|
results.json
CHANGED
@@ -31,7 +31,164 @@
|
|
31 |
],
|
32 |
"bleu": 0.5035795595158651,
|
33 |
"commonvoice_hours": 2649.0,
|
34 |
-
"commonvoice_locale": "en"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
},
|
36 |
{
|
37 |
"language_name": "Chinese",
|
@@ -45,7 +202,29 @@
|
|
45 |
],
|
46 |
"bleu": 0.35763875438716014,
|
47 |
"commonvoice_hours": 422.0,
|
48 |
-
"commonvoice_locale": "zh-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
},
|
50 |
{
|
51 |
"language_name": "Hindi",
|
@@ -59,7 +238,15 @@
|
|
59 |
],
|
60 |
"bleu": 0.33760351976648345,
|
61 |
"commonvoice_hours": 16.0,
|
62 |
-
"commonvoice_locale": "hi"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
},
|
64 |
{
|
65 |
"language_name": "Spanish",
|
@@ -73,7 +260,48 @@
|
|
73 |
],
|
74 |
"bleu": 0.3600460831160618,
|
75 |
"commonvoice_hours": 446.0,
|
76 |
-
"commonvoice_locale": "es"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
},
|
78 |
{
|
79 |
"language_name": "Arabic",
|
@@ -87,7 +315,47 @@
|
|
87 |
],
|
88 |
"bleu": 0.3046598747480405,
|
89 |
"commonvoice_hours": 91.0,
|
90 |
-
"commonvoice_locale": "ar"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
},
|
92 |
{
|
93 |
"language_name": "Urdu",
|
@@ -101,7 +369,14 @@
|
|
101 |
],
|
102 |
"bleu": 0.331647033312127,
|
103 |
"commonvoice_hours": 76.0,
|
104 |
-
"commonvoice_locale": "ur"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
},
|
106 |
{
|
107 |
"language_name": "French",
|
@@ -115,7 +390,71 @@
|
|
115 |
],
|
116 |
"bleu": 0.3141809404018014,
|
117 |
"commonvoice_hours": 1051.0,
|
118 |
-
"commonvoice_locale": "fr"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
},
|
120 |
{
|
121 |
"language_name": "Bangla",
|
@@ -129,7 +468,14 @@
|
|
129 |
],
|
130 |
"bleu": 0.27472181972977344,
|
131 |
"commonvoice_hours": 49.0,
|
132 |
-
"commonvoice_locale": "bn"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
},
|
134 |
{
|
135 |
"language_name": "Portuguese",
|
@@ -163,7 +509,25 @@
|
|
163 |
],
|
164 |
"bleu": 0.367787171884892,
|
165 |
"commonvoice_hours": 176.0,
|
166 |
-
"commonvoice_locale": "pt"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
},
|
168 |
{
|
169 |
"language_name": "Punjabi",
|
@@ -197,7 +561,15 @@
|
|
197 |
],
|
198 |
"bleu": 0.31594664710428266,
|
199 |
"commonvoice_hours": 2.3,
|
200 |
-
"commonvoice_locale": "pa-IN"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
},
|
202 |
{
|
203 |
"language_name": "Russian",
|
@@ -211,7 +583,32 @@
|
|
211 |
],
|
212 |
"bleu": 0.2920291935463745,
|
213 |
"commonvoice_hours": 241.0,
|
214 |
-
"commonvoice_locale": "ru"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
},
|
216 |
{
|
217 |
"language_name": "Swahili",
|
@@ -245,7 +642,18 @@
|
|
245 |
],
|
246 |
"bleu": 0.3018786362743097,
|
247 |
"commonvoice_hours": 411.0,
|
248 |
-
"commonvoice_locale": "sw"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
},
|
250 |
{
|
251 |
"language_name": "Indonesian",
|
@@ -279,7 +687,11 @@
|
|
279 |
],
|
280 |
"bleu": 0.31132422822400946,
|
281 |
"commonvoice_hours": 33.0,
|
282 |
-
"commonvoice_locale": "id"
|
|
|
|
|
|
|
|
|
283 |
},
|
284 |
{
|
285 |
"language_name": "German",
|
@@ -313,7 +725,36 @@
|
|
313 |
],
|
314 |
"bleu": 0.3992689214831344,
|
315 |
"commonvoice_hours": 1357.0,
|
316 |
-
"commonvoice_locale": "de"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
},
|
318 |
{
|
319 |
"language_name": "Japanese",
|
@@ -327,7 +768,12 @@
|
|
327 |
],
|
328 |
"bleu": 0.2954810072264808,
|
329 |
"commonvoice_hours": 222.0,
|
330 |
-
"commonvoice_locale": "ja"
|
|
|
|
|
|
|
|
|
|
|
331 |
},
|
332 |
{
|
333 |
"language_name": "Telugu",
|
@@ -341,7 +787,10 @@
|
|
341 |
],
|
342 |
"bleu": 0.37949545228579734,
|
343 |
"commonvoice_hours": 0.3,
|
344 |
-
"commonvoice_locale": "te"
|
|
|
|
|
|
|
345 |
},
|
346 |
{
|
347 |
"language_name": "Marathi",
|
@@ -355,7 +804,10 @@
|
|
355 |
],
|
356 |
"bleu": 0.2852384896861461,
|
357 |
"commonvoice_hours": 20.0,
|
358 |
-
"commonvoice_locale": "mr"
|
|
|
|
|
|
|
359 |
},
|
360 |
{
|
361 |
"language_name": "Javanese",
|
@@ -389,7 +841,11 @@
|
|
389 |
],
|
390 |
"bleu": 0.2505244065073906,
|
391 |
"commonvoice_hours": 0.0,
|
392 |
-
"commonvoice_locale": "jv"
|
|
|
|
|
|
|
|
|
393 |
},
|
394 |
{
|
395 |
"language_name": "Vietnamese",
|
@@ -403,7 +859,13 @@
|
|
403 |
],
|
404 |
"bleu": 0.2956750563565745,
|
405 |
"commonvoice_hours": 5.9,
|
406 |
-
"commonvoice_locale": "vi"
|
|
|
|
|
|
|
|
|
|
|
|
|
407 |
},
|
408 |
{
|
409 |
"language_name": "Tamil",
|
@@ -417,7 +879,17 @@
|
|
417 |
],
|
418 |
"bleu": 0.27547489589987734,
|
419 |
"commonvoice_hours": 234.0,
|
420 |
-
"commonvoice_locale": "ta"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
},
|
422 |
{
|
423 |
"language_name": "Persian",
|
@@ -431,7 +903,18 @@
|
|
431 |
],
|
432 |
"bleu": 0.2858012364771329,
|
433 |
"commonvoice_hours": 370.0,
|
434 |
-
"commonvoice_locale": "fa"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
435 |
},
|
436 |
{
|
437 |
"language_name": "Turkish",
|
@@ -465,7 +948,21 @@
|
|
465 |
],
|
466 |
"bleu": 0.30402386618673855,
|
467 |
"commonvoice_hours": 127.0,
|
468 |
-
"commonvoice_locale": "tr"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
},
|
470 |
{
|
471 |
"language_name": "Cantonese",
|
@@ -499,7 +996,12 @@
|
|
499 |
],
|
500 |
"bleu": 0.27975991005230577,
|
501 |
"commonvoice_hours": 203.0,
|
502 |
-
"commonvoice_locale": "yue"
|
|
|
|
|
|
|
|
|
|
|
503 |
},
|
504 |
{
|
505 |
"language_name": "Korean",
|
@@ -513,7 +1015,16 @@
|
|
513 |
],
|
514 |
"bleu": 0.24501349273295708,
|
515 |
"commonvoice_hours": 1.7,
|
516 |
-
"commonvoice_locale": "ko"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
517 |
},
|
518 |
{
|
519 |
"language_name": "Italian",
|
@@ -527,7 +1038,24 @@
|
|
527 |
],
|
528 |
"bleu": 0.3273249067267197,
|
529 |
"commonvoice_hours": 362.0,
|
530 |
-
"commonvoice_locale": "it"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
531 |
},
|
532 |
{
|
533 |
"language_name": "Filipino",
|
@@ -561,7 +1089,12 @@
|
|
561 |
],
|
562 |
"bleu": 0.3353425581350746,
|
563 |
"commonvoice_hours": 0.0,
|
564 |
-
"commonvoice_locale": "tl"
|
|
|
|
|
|
|
|
|
|
|
565 |
},
|
566 |
{
|
567 |
"language_name": "Egyptian Arabic",
|
@@ -574,8 +1107,11 @@
|
|
574 |
}
|
575 |
],
|
576 |
"bleu": 0.23431638822117362,
|
577 |
-
"commonvoice_hours":
|
578 |
-
"commonvoice_locale":
|
|
|
|
|
|
|
579 |
},
|
580 |
{
|
581 |
"language_name": "Gujarati",
|
@@ -589,6 +1125,12 @@
|
|
589 |
],
|
590 |
"bleu": 0.27834507803114356,
|
591 |
"commonvoice_hours": 0.0,
|
592 |
-
"commonvoice_locale": "gu-IN"
|
|
|
|
|
|
|
|
|
|
|
|
|
593 |
}
|
594 |
]
|
|
|
31 |
],
|
32 |
"bleu": 0.5035795595158651,
|
33 |
"commonvoice_hours": 2649.0,
|
34 |
+
"commonvoice_locale": "en",
|
35 |
+
"population": {
|
36 |
+
"AC": 931,
|
37 |
+
"AE": 4996040,
|
38 |
+
"AG": 84434,
|
39 |
+
"AI": 17186,
|
40 |
+
"AQ": 300,
|
41 |
+
"AR": 3183537,
|
42 |
+
"AS": 47954,
|
43 |
+
"AT": 6467398,
|
44 |
+
"AU": 24447840,
|
45 |
+
"AW": 2986,
|
46 |
+
"BA": 1726016,
|
47 |
+
"BB": 294560,
|
48 |
+
"BD": 29277180,
|
49 |
+
"BE": 6915213,
|
50 |
+
"BG": 1741725,
|
51 |
+
"BI": 6289,
|
52 |
+
"BM": 66010,
|
53 |
+
"BN": 7896,
|
54 |
+
"BR": 16937280,
|
55 |
+
"BS": 337721,
|
56 |
+
"BT": 86055,
|
57 |
+
"BV": 1,
|
58 |
+
"BW": 1876956,
|
59 |
+
"BZ": 399598,
|
60 |
+
"CA": 32416926,
|
61 |
+
"CC": 101,
|
62 |
+
"CH": 5126434,
|
63 |
+
"CK": 8574,
|
64 |
+
"CL": 1727746,
|
65 |
+
"CM": 10543100,
|
66 |
+
"CN": 62731,
|
67 |
+
"CP": 1,
|
68 |
+
"CQ": 482,
|
69 |
+
"CX": 1389,
|
70 |
+
"CY": 924676,
|
71 |
+
"CZ": 2889675,
|
72 |
+
"DE": 51302208,
|
73 |
+
"DG": 495,
|
74 |
+
"DK": 5047693,
|
75 |
+
"DM": 69788,
|
76 |
+
"DO": 7980,
|
77 |
+
"DZ": 3008103,
|
78 |
+
"EE": 614310,
|
79 |
+
"EG": 36443400,
|
80 |
+
"ER": 3587908,
|
81 |
+
"ES": 12003792,
|
82 |
+
"ET": 46488590,
|
83 |
+
"FI": 3900169,
|
84 |
+
"FJ": 879816,
|
85 |
+
"FK": 2814,
|
86 |
+
"FM": 58389,
|
87 |
+
"FR": 26460798,
|
88 |
+
"GB": 64445878,
|
89 |
+
"GD": 108570,
|
90 |
+
"GG": 67052,
|
91 |
+
"GH": 6161442,
|
92 |
+
"GI": 23665,
|
93 |
+
"GM": 869600,
|
94 |
+
"GR": 5409621,
|
95 |
+
"GS": 20,
|
96 |
+
"GU": 153321,
|
97 |
+
"GY": 750204,
|
98 |
+
"HK": 3697454,
|
99 |
+
"HM": 1,
|
100 |
+
"HN": 40635,
|
101 |
+
"HR": 2071598,
|
102 |
+
"HU": 1954366,
|
103 |
+
"IE": 5073039,
|
104 |
+
"IL": 7374158,
|
105 |
+
"IM": 90499,
|
106 |
+
"IN": 251957100,
|
107 |
+
"IO": 3500,
|
108 |
+
"IQ": 13605445,
|
109 |
+
"IT": 21216918,
|
110 |
+
"JE": 96019,
|
111 |
+
"JM": 2752399,
|
112 |
+
"JO": 4869270,
|
113 |
+
"KE": 10170301,
|
114 |
+
"KI": 111796,
|
115 |
+
"KN": 52745,
|
116 |
+
"KY": 60705,
|
117 |
+
"KZ": 2863785,
|
118 |
+
"LB": 2187844,
|
119 |
+
"LC": 149838,
|
120 |
+
"LK": 2288920,
|
121 |
+
"LR": 4210839,
|
122 |
+
"LS": 531719,
|
123 |
+
"LT": 1037955,
|
124 |
+
"LU": 351893,
|
125 |
+
"LV": 865366,
|
126 |
+
"MA": 4978638,
|
127 |
+
"MG": 4852026,
|
128 |
+
"MH": 72463,
|
129 |
+
"MO": 14133,
|
130 |
+
"MP": 49890,
|
131 |
+
"MS": 3492,
|
132 |
+
"MT": 402395,
|
133 |
+
"MU": 993146,
|
134 |
+
"MV": 293928,
|
135 |
+
"MW": 13353858,
|
136 |
+
"MX": 16724500,
|
137 |
+
"MY": 6856941,
|
138 |
+
"NA": 184105,
|
139 |
+
"NF": 1678,
|
140 |
+
"NG": 113434840,
|
141 |
+
"NL": 15552360,
|
142 |
+
"NP": 909837,
|
143 |
+
"NR": 9350,
|
144 |
+
"NU": 1120,
|
145 |
+
"NZ": 4826970,
|
146 |
+
"PA": 545171,
|
147 |
+
"PG": 3629730,
|
148 |
+
"PH": 69875840,
|
149 |
+
"PK": 116750500,
|
150 |
+
"PL": 12633159,
|
151 |
+
"PM": 187,
|
152 |
+
"PN": 46,
|
153 |
+
"PR": 1562644,
|
154 |
+
"PT": 2781729,
|
155 |
+
"PW": 1887,
|
156 |
+
"RO": 6603899,
|
157 |
+
"RW": 1906860,
|
158 |
+
"SB": 685097,
|
159 |
+
"SC": 36473,
|
160 |
+
"SD": 27792576,
|
161 |
+
"SE": 8774150,
|
162 |
+
"SG": 5774984,
|
163 |
+
"SH": 5425,
|
164 |
+
"SI": 1240581,
|
165 |
+
"SK": 1414556,
|
166 |
+
"SL": 2318726,
|
167 |
+
"SS": 2851524,
|
168 |
+
"SX": 29816,
|
169 |
+
"SZ": 883584,
|
170 |
+
"TA": 272,
|
171 |
+
"TC": 54807,
|
172 |
+
"TH": 18623898,
|
173 |
+
"TK": 1285,
|
174 |
+
"TO": 29707,
|
175 |
+
"TR": 13942975,
|
176 |
+
"TT": 1063735,
|
177 |
+
"TV": 1066,
|
178 |
+
"TZ": 40401432,
|
179 |
+
"UG": 1686867,
|
180 |
+
"UM": 316,
|
181 |
+
"US": 319333440,
|
182 |
+
"VC": 97334,
|
183 |
+
"VG": 36633,
|
184 |
+
"VI": 79676,
|
185 |
+
"VU": 247616,
|
186 |
+
"WS": 4279,
|
187 |
+
"YE": 2689596,
|
188 |
+
"ZA": 17503716,
|
189 |
+
"ZM": 2788256,
|
190 |
+
"ZW": 6109446
|
191 |
+
}
|
192 |
},
|
193 |
{
|
194 |
"language_name": "Chinese",
|
|
|
202 |
],
|
203 |
"bleu": 0.35763875438716014,
|
204 |
"commonvoice_hours": 422.0,
|
205 |
+
"commonvoice_locale": "zh-TW",
|
206 |
+
"population": {
|
207 |
+
"AU": 534796,
|
208 |
+
"BN": 51093,
|
209 |
+
"CA": 678494,
|
210 |
+
"CN": 1254618000,
|
211 |
+
"GB": 197283,
|
212 |
+
"GF": 4988,
|
213 |
+
"HK": 7249910,
|
214 |
+
"ID": 2456639,
|
215 |
+
"MN": 44352,
|
216 |
+
"MO": 632892,
|
217 |
+
"MY": 5550857,
|
218 |
+
"PA": 5841,
|
219 |
+
"PF": 23019,
|
220 |
+
"PH": 797021,
|
221 |
+
"SG": 4781438,
|
222 |
+
"SR": 6705,
|
223 |
+
"TH": 1241593,
|
224 |
+
"TW": 22422850,
|
225 |
+
"US": 2295209,
|
226 |
+
"VN": 1085934
|
227 |
+
}
|
228 |
},
|
229 |
{
|
230 |
"language_name": "Hindi",
|
|
|
238 |
],
|
239 |
"bleu": 0.33760351976648345,
|
240 |
"commonvoice_hours": 16.0,
|
241 |
+
"commonvoice_locale": "hi-IN",
|
242 |
+
"population": {
|
243 |
+
"CA": 188470,
|
244 |
+
"FJ": 411829,
|
245 |
+
"IN": 545022990,
|
246 |
+
"NP": 127377,
|
247 |
+
"UG": 2206,
|
248 |
+
"ZA": 1129272
|
249 |
+
}
|
250 |
},
|
251 |
{
|
252 |
"language_name": "Spanish",
|
|
|
260 |
],
|
261 |
"bleu": 0.3600460831160618,
|
262 |
"commonvoice_hours": 446.0,
|
263 |
+
"commonvoice_locale": "es",
|
264 |
+
"population": {
|
265 |
+
"AD": 33110,
|
266 |
+
"AR": 45479100,
|
267 |
+
"BO": 7100339,
|
268 |
+
"BR": 76218,
|
269 |
+
"BZ": 111887,
|
270 |
+
"CA": 603106,
|
271 |
+
"CL": 17823064,
|
272 |
+
"CO": 45648864,
|
273 |
+
"CR": 4843090,
|
274 |
+
"CU": 11059100,
|
275 |
+
"CW": 5751,
|
276 |
+
"DE": 4809582,
|
277 |
+
"DO": 8189766,
|
278 |
+
"EA": 147000,
|
279 |
+
"EC": 16228704,
|
280 |
+
"ES": 49515642,
|
281 |
+
"FR": 8820266,
|
282 |
+
"GB": 5260888,
|
283 |
+
"GI": 14790,
|
284 |
+
"GQ": 727475,
|
285 |
+
"GT": 15952569,
|
286 |
+
"HN": 7203565,
|
287 |
+
"IC": 2056618,
|
288 |
+
"MA": 23115,
|
289 |
+
"MX": 106779500,
|
290 |
+
"NI": 4838683,
|
291 |
+
"PA": 2686915,
|
292 |
+
"PE": 23297950,
|
293 |
+
"PH": 33846110,
|
294 |
+
"PR": 2774491,
|
295 |
+
"PT": 1030270,
|
296 |
+
"PY": 230134,
|
297 |
+
"RO": 2130290,
|
298 |
+
"SV": 5768179,
|
299 |
+
"SX": 4823,
|
300 |
+
"TT": 4110,
|
301 |
+
"US": 31933344,
|
302 |
+
"UY": 2981097,
|
303 |
+
"VE": 23488572
|
304 |
+
}
|
305 |
},
|
306 |
{
|
307 |
"language_name": "Arabic",
|
|
|
315 |
],
|
316 |
"bleu": 0.3046598747480405,
|
317 |
"commonvoice_hours": 91.0,
|
318 |
+
"commonvoice_locale": "ar",
|
319 |
+
"population": {
|
320 |
+
"AE": 7793822,
|
321 |
+
"BH": 1309350,
|
322 |
+
"CA": 565412,
|
323 |
+
"CM": 108206,
|
324 |
+
"CY": 1267,
|
325 |
+
"DJ": 67292,
|
326 |
+
"DZ": 31799946,
|
327 |
+
"EG": 97876560,
|
328 |
+
"EH": 652271,
|
329 |
+
"ER": 297979,
|
330 |
+
"GB": 197283,
|
331 |
+
"IL": 1735096,
|
332 |
+
"IQ": 26433436,
|
333 |
+
"IR": 1698466,
|
334 |
+
"JO": 10820600,
|
335 |
+
"KE": 24623,
|
336 |
+
"KM": 558545,
|
337 |
+
"KW": 2993710,
|
338 |
+
"LB": 4703865,
|
339 |
+
"LY": 5099000,
|
340 |
+
"MA": 22048254,
|
341 |
+
"ML": 175981,
|
342 |
+
"MR": 3404658,
|
343 |
+
"NE": 47822,
|
344 |
+
"NG": 151960,
|
345 |
+
"OM": 3778520,
|
346 |
+
"PS": 4818260,
|
347 |
+
"QA": 2175311,
|
348 |
+
"SA": 34173500,
|
349 |
+
"SD": 27792576,
|
350 |
+
"SO": 3997414,
|
351 |
+
"SS": 2851524,
|
352 |
+
"SY": 15518720,
|
353 |
+
"TD": 2869158,
|
354 |
+
"TJ": 976,
|
355 |
+
"TN": 10549080,
|
356 |
+
"TR": 459298,
|
357 |
+
"YE": 22114456
|
358 |
+
}
|
359 |
},
|
360 |
{
|
361 |
"language_name": "Urdu",
|
|
|
369 |
],
|
370 |
"bleu": 0.331647033312127,
|
371 |
"commonvoice_hours": 76.0,
|
372 |
+
"commonvoice_locale": "ur",
|
373 |
+
"population": {
|
374 |
+
"CA": 286475,
|
375 |
+
"GB": 2301638,
|
376 |
+
"IN": 66304500,
|
377 |
+
"MU": 71727,
|
378 |
+
"PK": 221825950
|
379 |
+
}
|
380 |
},
|
381 |
{
|
382 |
"language_name": "French",
|
|
|
390 |
],
|
391 |
"bleu": 0.3141809404018014,
|
392 |
"commonvoice_hours": 1051.0,
|
393 |
+
"commonvoice_locale": "fr",
|
394 |
+
"population": {
|
395 |
+
"AD": 5775,
|
396 |
+
"AT": 974540,
|
397 |
+
"BE": 4453866,
|
398 |
+
"BF": 4583788,
|
399 |
+
"BI": 7000822,
|
400 |
+
"BJ": 4502610,
|
401 |
+
"BL": 6837,
|
402 |
+
"CA": 11308230,
|
403 |
+
"CD": 3867640,
|
404 |
+
"CF": 2935521,
|
405 |
+
"CG": 4446179,
|
406 |
+
"CH": 1764838,
|
407 |
+
"CI": 13465739,
|
408 |
+
"CM": 18866600,
|
409 |
+
"CY": 88668,
|
410 |
+
"DE": 14428746,
|
411 |
+
"DJ": 19358,
|
412 |
+
"DZ": 8594580,
|
413 |
+
"FR": 67169718,
|
414 |
+
"GA": 1405473,
|
415 |
+
"GB": 15125053,
|
416 |
+
"GF": 153622,
|
417 |
+
"GN": 3632946,
|
418 |
+
"GP": 407498,
|
419 |
+
"GQ": 73584,
|
420 |
+
"GR": 954639,
|
421 |
+
"HT": 520187,
|
422 |
+
"HU": 293155,
|
423 |
+
"IE": 880017,
|
424 |
+
"IT": 3931370,
|
425 |
+
"KM": 473917,
|
426 |
+
"LB": 20238,
|
427 |
+
"LU": 546691,
|
428 |
+
"MA": 7112340,
|
429 |
+
"MC": 38610,
|
430 |
+
"MF": 32556,
|
431 |
+
"MG": 18599433,
|
432 |
+
"ML": 8994564,
|
433 |
+
"MQ": 427408,
|
434 |
+
"MR": 680932,
|
435 |
+
"MT": 50299,
|
436 |
+
"MU": 41381,
|
437 |
+
"NC": 278409,
|
438 |
+
"NE": 6603996,
|
439 |
+
"NL": 5011316,
|
440 |
+
"PF": 180024,
|
441 |
+
"PM": 5133,
|
442 |
+
"PT": 1545405,
|
443 |
+
"RE": 700950,
|
444 |
+
"RO": 3621493,
|
445 |
+
"RW": 2288,
|
446 |
+
"SC": 57589,
|
447 |
+
"SN": 6137196,
|
448 |
+
"SY": 1144506,
|
449 |
+
"TD": 4388124,
|
450 |
+
"TF": 140,
|
451 |
+
"TG": 5251148,
|
452 |
+
"TN": 8673688,
|
453 |
+
"US": 1862778,
|
454 |
+
"VU": 149166,
|
455 |
+
"WF": 7610,
|
456 |
+
"YT": 110580
|
457 |
+
}
|
458 |
},
|
459 |
{
|
460 |
"language_name": "Bangla",
|
|
|
468 |
],
|
469 |
"bleu": 0.27472181972977344,
|
470 |
"commonvoice_hours": 49.0,
|
471 |
+
"commonvoice_locale": "bn",
|
472 |
+
"population": {
|
473 |
+
"BD": 159397980,
|
474 |
+
"CA": 90466,
|
475 |
+
"GB": 263044,
|
476 |
+
"IN": 107413290,
|
477 |
+
"NP": 28508
|
478 |
+
}
|
479 |
},
|
480 |
{
|
481 |
"language_name": "Portuguese",
|
|
|
509 |
],
|
510 |
"bleu": 0.367787171884892,
|
511 |
"commonvoice_hours": 176.0,
|
512 |
+
"commonvoice_locale": "pt",
|
513 |
+
"population": {
|
514 |
+
"AG": 1571,
|
515 |
+
"AO": 21789941,
|
516 |
+
"BR": 192661560,
|
517 |
+
"CA": 229934,
|
518 |
+
"CH": 285736,
|
519 |
+
"CV": 443274,
|
520 |
+
"FR": 882027,
|
521 |
+
"GB": 131522,
|
522 |
+
"GQ": 1,
|
523 |
+
"GW": 1927100,
|
524 |
+
"LU": 100541,
|
525 |
+
"MO": 30723,
|
526 |
+
"MZ": 8126514,
|
527 |
+
"PT": 9890592,
|
528 |
+
"ST": 179454,
|
529 |
+
"TL": 816395
|
530 |
+
}
|
531 |
},
|
532 |
{
|
533 |
"language_name": "Punjabi",
|
|
|
561 |
],
|
562 |
"bleu": 0.31594664710428266,
|
563 |
"commonvoice_hours": 2.3,
|
564 |
+
"commonvoice_locale": "pa-IN",
|
565 |
+
"population": {
|
566 |
+
"CA": 603106,
|
567 |
+
"GB": 2367400,
|
568 |
+
"IN": 37130520,
|
569 |
+
"KE": 10170,
|
570 |
+
"PK": 163450700,
|
571 |
+
"SG": 9314
|
572 |
+
}
|
573 |
},
|
574 |
{
|
575 |
"language_name": "Russian",
|
|
|
583 |
],
|
584 |
"bleu": 0.2920291935463745,
|
585 |
"commonvoice_hours": 241.0,
|
586 |
+
"commonvoice_locale": "ru",
|
587 |
+
"population": {
|
588 |
+
"BG": 1602387,
|
589 |
+
"BY": 1137350,
|
590 |
+
"CA": 211087,
|
591 |
+
"CN": 13940,
|
592 |
+
"DE": 4809582,
|
593 |
+
"EE": 688027,
|
594 |
+
"FI": 45131,
|
595 |
+
"GE": 359730,
|
596 |
+
"IL": 954303,
|
597 |
+
"KG": 2147364,
|
598 |
+
"KZ": 13746168,
|
599 |
+
"LT": 2185168,
|
600 |
+
"LV": 714867,
|
601 |
+
"MD": 100935,
|
602 |
+
"MN": 4118,
|
603 |
+
"PL": 6890814,
|
604 |
+
"RU": 133218680,
|
605 |
+
"SJ": 1200,
|
606 |
+
"TJ": 1064840,
|
607 |
+
"TM": 663436,
|
608 |
+
"UA": 20204534,
|
609 |
+
"US": 798334,
|
610 |
+
"UZ": 4279156
|
611 |
+
}
|
612 |
},
|
613 |
{
|
614 |
"language_name": "Swahili",
|
|
|
642 |
],
|
643 |
"bleu": 0.3018786362743097,
|
644 |
"commonvoice_hours": 411.0,
|
645 |
+
"commonvoice_locale": "sw",
|
646 |
+
"population": {
|
647 |
+
"BI": 6408,
|
648 |
+
"CD": 50890000,
|
649 |
+
"KE": 35328414,
|
650 |
+
"MZ": 9330,
|
651 |
+
"SO": 235142,
|
652 |
+
"TZ": 52697520,
|
653 |
+
"UG": 32439750,
|
654 |
+
"YT": 2716,
|
655 |
+
"ZA": 1016
|
656 |
+
}
|
657 |
},
|
658 |
{
|
659 |
"language_name": "Indonesian",
|
|
|
687 |
],
|
688 |
"bleu": 0.31132422822400946,
|
689 |
"commonvoice_hours": 33.0,
|
690 |
+
"commonvoice_locale": "id",
|
691 |
+
"population": {
|
692 |
+
"ID": 170896640,
|
693 |
+
"NL": 311047
|
694 |
+
}
|
695 |
},
|
696 |
{
|
697 |
"language_name": "German",
|
|
|
725 |
],
|
726 |
"bleu": 0.3992689214831344,
|
727 |
"commonvoice_hours": 1357.0,
|
728 |
+
"commonvoice_locale": "de",
|
729 |
+
"population": {
|
730 |
+
"AT": 8593666,
|
731 |
+
"BE": 2578554,
|
732 |
+
"BG": 557352,
|
733 |
+
"BR": 1778414,
|
734 |
+
"CA": 294014,
|
735 |
+
"CH": 6134913,
|
736 |
+
"CZ": 1605375,
|
737 |
+
"DE": 72945327,
|
738 |
+
"DK": 2758623,
|
739 |
+
"FI": 1002901,
|
740 |
+
"FR": 3392410,
|
741 |
+
"GB": 5918499,
|
742 |
+
"GR": 530355,
|
743 |
+
"HU": 1758929,
|
744 |
+
"IT": 998443,
|
745 |
+
"KZ": 1221882,
|
746 |
+
"LI": 39137,
|
747 |
+
"LT": 382404,
|
748 |
+
"LU": 395880,
|
749 |
+
"NA": 23671,
|
750 |
+
"NL": 12269084,
|
751 |
+
"PL": 7273637,
|
752 |
+
"PY": 208559,
|
753 |
+
"RO": 44736,
|
754 |
+
"SI": 883126,
|
755 |
+
"SK": 1196932,
|
756 |
+
"US": 1563403
|
757 |
+
}
|
758 |
},
|
759 |
{
|
760 |
"language_name": "Japanese",
|
|
|
768 |
],
|
769 |
"bleu": 0.2954810072264808,
|
770 |
"commonvoice_hours": 222.0,
|
771 |
+
"commonvoice_locale": "ja",
|
772 |
+
"population": {
|
773 |
+
"BR": 444604,
|
774 |
+
"CA": 52772,
|
775 |
+
"JP": 119231650
|
776 |
+
}
|
777 |
},
|
778 |
{
|
779 |
"language_name": "Telugu",
|
|
|
787 |
],
|
788 |
"bleu": 0.37949545228579734,
|
789 |
"commonvoice_hours": 0.3,
|
790 |
+
"commonvoice_locale": "te",
|
791 |
+
"population": {
|
792 |
+
"IN": 95478480
|
793 |
+
}
|
794 |
},
|
795 |
{
|
796 |
"language_name": "Marathi",
|
|
|
804 |
],
|
805 |
"bleu": 0.2852384896861461,
|
806 |
"commonvoice_hours": 20.0,
|
807 |
+
"commonvoice_locale": "mr",
|
808 |
+
"population": {
|
809 |
+
"IN": 92826300
|
810 |
+
}
|
811 |
},
|
812 |
{
|
813 |
"language_name": "Javanese",
|
|
|
841 |
],
|
842 |
"bleu": 0.2505244065073906,
|
843 |
"commonvoice_hours": 0.0,
|
844 |
+
"commonvoice_locale": "jv",
|
845 |
+
"population": {
|
846 |
+
"ID": 90788840,
|
847 |
+
"MY": 391825
|
848 |
+
}
|
849 |
},
|
850 |
{
|
851 |
"language_name": "Vietnamese",
|
|
|
859 |
],
|
860 |
"bleu": 0.2956750563565745,
|
861 |
"commonvoice_hours": 5.9,
|
862 |
+
"commonvoice_locale": "vi",
|
863 |
+
"population": {
|
864 |
+
"CA": 184701,
|
865 |
+
"CN": 6970,
|
866 |
+
"US": 1130973,
|
867 |
+
"VN": 84900318
|
868 |
+
}
|
869 |
},
|
870 |
{
|
871 |
"language_name": "Tamil",
|
|
|
879 |
],
|
880 |
"bleu": 0.27547489589987734,
|
881 |
"commonvoice_hours": 234.0,
|
882 |
+
"commonvoice_locale": "ta",
|
883 |
+
"population": {
|
884 |
+
"CA": 184701,
|
885 |
+
"GB": 2104355,
|
886 |
+
"IN": 78239310,
|
887 |
+
"LK": 3433380,
|
888 |
+
"MU": 34484,
|
889 |
+
"MY": 1371388,
|
890 |
+
"RE": 118138,
|
891 |
+
"SG": 130403
|
892 |
+
}
|
893 |
},
|
894 |
{
|
895 |
"language_name": "Persian",
|
|
|
903 |
],
|
904 |
"bleu": 0.2858012364771329,
|
905 |
"commonvoice_hours": 370.0,
|
906 |
+
"commonvoice_locale": "fa",
|
907 |
+
"population": {
|
908 |
+
"AE": 189850,
|
909 |
+
"AF": 18321900,
|
910 |
+
"CA": 245012,
|
911 |
+
"IQ": 338192,
|
912 |
+
"IR": 63692475,
|
913 |
+
"OM": 43849,
|
914 |
+
"PK": 1541107,
|
915 |
+
"QA": 268859,
|
916 |
+
"TJ": 69215
|
917 |
+
}
|
918 |
},
|
919 |
{
|
920 |
"language_name": "Turkish",
|
|
|
948 |
],
|
949 |
"bleu": 0.30402386618673855,
|
950 |
"commonvoice_hours": 127.0,
|
951 |
+
"commonvoice_locale": "tr",
|
952 |
+
"population": {
|
953 |
+
"BG": 766359,
|
954 |
+
"CA": 37694,
|
955 |
+
"CY": 291336,
|
956 |
+
"DE": 2003992,
|
957 |
+
"GB": 131522,
|
958 |
+
"GR": 127285,
|
959 |
+
"MK": 74409,
|
960 |
+
"NL": 207365,
|
961 |
+
"RO": 27694,
|
962 |
+
"TR": 76276275,
|
963 |
+
"UA": 184476,
|
964 |
+
"UZ": 232297
|
965 |
+
}
|
966 |
},
|
967 |
{
|
968 |
"language_name": "Cantonese",
|
|
|
996 |
],
|
997 |
"bleu": 0.27975991005230577,
|
998 |
"commonvoice_hours": 203.0,
|
999 |
+
"commonvoice_locale": "yue",
|
1000 |
+
"population": {
|
1001 |
+
"CA": 640800,
|
1002 |
+
"CN": 72489040,
|
1003 |
+
"HK": 6524919
|
1004 |
+
}
|
1005 |
},
|
1006 |
{
|
1007 |
"language_name": "Korean",
|
|
|
1015 |
],
|
1016 |
"bleu": 0.24501349273295708,
|
1017 |
"commonvoice_hours": 1.7,
|
1018 |
+
"commonvoice_locale": "ko",
|
1019 |
+
"population": {
|
1020 |
+
"BR": 44460,
|
1021 |
+
"CA": 169623,
|
1022 |
+
"CN": 2091030,
|
1023 |
+
"JP": 652636,
|
1024 |
+
"KP": 22566280,
|
1025 |
+
"KR": 51835100,
|
1026 |
+
"US": 997917
|
1027 |
+
}
|
1028 |
},
|
1029 |
{
|
1030 |
"language_name": "Italian",
|
|
|
1038 |
],
|
1039 |
"bleu": 0.3273249067267197,
|
1040 |
"commonvoice_hours": 362.0,
|
1041 |
+
"commonvoice_locale": "it",
|
1042 |
+
"population": {
|
1043 |
+
"AT": 797350,
|
1044 |
+
"AU": 483864,
|
1045 |
+
"BR": 592805,
|
1046 |
+
"CA": 343016,
|
1047 |
+
"CH": 361372,
|
1048 |
+
"DE": 5611179,
|
1049 |
+
"FR": 1153419,
|
1050 |
+
"GB": 131522,
|
1051 |
+
"HR": 67644,
|
1052 |
+
"IT": 59282565,
|
1053 |
+
"MT": 256070,
|
1054 |
+
"SI": 3995,
|
1055 |
+
"SM": 30466,
|
1056 |
+
"US": 1130973,
|
1057 |
+
"VA": 820
|
1058 |
+
}
|
1059 |
},
|
1060 |
{
|
1061 |
"language_name": "Filipino",
|
|
|
1089 |
],
|
1090 |
"bleu": 0.3353425581350746,
|
1091 |
"commonvoice_hours": 0.0,
|
1092 |
+
"commonvoice_locale": "tl",
|
1093 |
+
"population": {
|
1094 |
+
"CA": 565412,
|
1095 |
+
"PH": 65508600,
|
1096 |
+
"US": 1397084
|
1097 |
+
}
|
1098 |
},
|
1099 |
{
|
1100 |
"language_name": "Egyptian Arabic",
|
|
|
1107 |
}
|
1108 |
],
|
1109 |
"bleu": 0.23431638822117362,
|
1110 |
+
"commonvoice_hours": null,
|
1111 |
+
"commonvoice_locale": null,
|
1112 |
+
"population": {
|
1113 |
+
"EG": 66639360
|
1114 |
+
}
|
1115 |
},
|
1116 |
{
|
1117 |
"language_name": "Gujarati",
|
|
|
1125 |
],
|
1126 |
"bleu": 0.27834507803114356,
|
1127 |
"commonvoice_hours": 0.0,
|
1128 |
+
"commonvoice_locale": "gu-IN",
|
1129 |
+
"population": {
|
1130 |
+
"CA": 135699,
|
1131 |
+
"GB": 1907072,
|
1132 |
+
"IN": 59674050,
|
1133 |
+
"KE": 4978
|
1134 |
+
}
|
1135 |
}
|
1136 |
]
|
uv.lock
CHANGED
@@ -930,6 +930,7 @@ dependencies = [
|
|
930 |
{ name = "gradio" },
|
931 |
{ name = "pandas" },
|
932 |
{ name = "plotly" },
|
|
|
933 |
]
|
934 |
|
935 |
[package.dev-dependencies]
|
@@ -954,6 +955,7 @@ requires-dist = [
|
|
954 |
{ name = "gradio", specifier = ">=5.16.2" },
|
955 |
{ name = "pandas", specifier = ">=2.2.3" },
|
956 |
{ name = "plotly", specifier = ">=6.0.0" },
|
|
|
957 |
]
|
958 |
|
959 |
[package.metadata.requires-dev]
|
@@ -1871,6 +1873,15 @@ wheels = [
|
|
1871 |
{ url = "https://files.pythonhosted.org/packages/92/a2/81c1dd744b322c0c548f793deb521bf23500806d754128ddf6f978736dff/pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420", size = 40006508 },
|
1872 |
]
|
1873 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1874 |
[[package]]
|
1875 |
name = "pydantic"
|
1876 |
version = "2.9.2"
|
|
|
930 |
{ name = "gradio" },
|
931 |
{ name = "pandas" },
|
932 |
{ name = "plotly" },
|
933 |
+
{ name = "pycountry" },
|
934 |
]
|
935 |
|
936 |
[package.dev-dependencies]
|
|
|
955 |
{ name = "gradio", specifier = ">=5.16.2" },
|
956 |
{ name = "pandas", specifier = ">=2.2.3" },
|
957 |
{ name = "plotly", specifier = ">=6.0.0" },
|
958 |
+
{ name = "pycountry" },
|
959 |
]
|
960 |
|
961 |
[package.metadata.requires-dev]
|
|
|
1873 |
{ url = "https://files.pythonhosted.org/packages/92/a2/81c1dd744b322c0c548f793deb521bf23500806d754128ddf6f978736dff/pyarrow-18.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b46591222c864e7da7faa3b19455196416cd8355ff6c2cc2e65726a760a3c420", size = 40006508 },
|
1874 |
]
|
1875 |
|
1876 |
+
[[package]]
|
1877 |
+
name = "pycountry"
|
1878 |
+
version = "24.6.1"
|
1879 |
+
source = { registry = "https://pypi.org/simple" }
|
1880 |
+
sdist = { url = "https://files.pythonhosted.org/packages/76/57/c389fa68c50590881a75b7883eeb3dc15e9e73a0fdc001cdd45c13290c92/pycountry-24.6.1.tar.gz", hash = "sha256:b61b3faccea67f87d10c1f2b0fc0be714409e8fcdcc1315613174f6466c10221", size = 6043910 }
|
1881 |
+
wheels = [
|
1882 |
+
{ url = "https://files.pythonhosted.org/packages/b1/ec/1fb891d8a2660716aadb2143235481d15ed1cbfe3ad669194690b0604492/pycountry-24.6.1-py3-none-any.whl", hash = "sha256:f1a4fb391cd7214f8eefd39556d740adcc233c778a27f8942c8dca351d6ce06f", size = 6335189 },
|
1883 |
+
]
|
1884 |
+
|
1885 |
[[package]]
|
1886 |
name = "pydantic"
|
1887 |
version = "2.9.2"
|