David Pomerenke
committed on
Commit
·
1167b2d
1
Parent(s):
7fc657e
Show classification and overall score in app
Browse files
- app.py +17 -17
- evals.py +7 -2
- results.json +40 -15
app.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
import json
|
2 |
|
3 |
import gradio as gr
|
4 |
-
import numpy as np
|
5 |
import pandas as pd
|
6 |
import plotly.graph_objects as go
|
7 |
import plotly.express as px
|
@@ -160,7 +159,7 @@ def create_language_stats_df(results):
|
|
160 |
for lang in results:
|
161 |
# Find the best model and its BLEU score
|
162 |
best_score = max(
|
163 |
-
lang["scores"] or [{"
|
164 |
)
|
165 |
|
166 |
model = best_score["model"]
|
@@ -178,18 +177,18 @@ def create_language_stats_df(results):
|
|
178 |
row = {
|
179 |
"Language": f"**{lang['language_name']}**",
|
180 |
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
|
181 |
-
"Models Tested": len(lang["scores"]),
|
182 |
-
"
|
183 |
-
if lang["
|
184 |
else "N/A",
|
185 |
-
"
|
186 |
-
|
187 |
-
if best_score["bleu"] is not None
|
188 |
else "N/A",
|
189 |
-
"
|
190 |
-
"Accuracy": round(lang["accuracy"], 3)
|
191 |
if lang["accuracy"] is not None
|
192 |
else "N/A",
|
|
|
|
|
193 |
}
|
194 |
flat_data.append(row)
|
195 |
|
@@ -199,13 +198,14 @@ def create_language_stats_df(results):
|
|
199 |
label="Language Results",
|
200 |
show_search="search",
|
201 |
datatype=[
|
202 |
-
"markdown",
|
203 |
-
"number",
|
204 |
-
"number",
|
205 |
-
"number",
|
206 |
-
"
|
207 |
-
"number",
|
208 |
-
"markdown",
|
|
|
209 |
],
|
210 |
)
|
211 |
|
|
|
1 |
import json
|
2 |
|
3 |
import gradio as gr
|
|
|
4 |
import pandas as pd
|
5 |
import plotly.graph_objects as go
|
6 |
import plotly.express as px
|
|
|
159 |
for lang in results:
|
160 |
# Find the best model and its BLEU score
|
161 |
best_score = max(
|
162 |
+
lang["scores"] or [{"overall_score": None, "model": None}], key=lambda x: x["overall_score"]
|
163 |
)
|
164 |
|
165 |
model = best_score["model"]
|
|
|
177 |
row = {
|
178 |
"Language": f"**{lang['language_name']}**",
|
179 |
"Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
|
180 |
+
# "Models Tested": len(lang["scores"]),
|
181 |
+
"Overall": round(lang["overall_score"], 3)
|
182 |
+
if lang["overall_score"] is not None
|
183 |
else "N/A",
|
184 |
+
"Trans-lation": round(lang["bleu"], 3)
|
185 |
+
if lang["bleu"] is not None
|
|
|
186 |
else "N/A",
|
187 |
+
"Classi-fication": round(lang["accuracy"], 3)
|
|
|
188 |
if lang["accuracy"] is not None
|
189 |
else "N/A",
|
190 |
+
"Best Model": model_link,
|
191 |
+
"CommonVoice Hours": commonvoice_link,
|
192 |
}
|
193 |
flat_data.append(row)
|
194 |
|
|
|
198 |
label="Language Results",
|
199 |
show_search="search",
|
200 |
datatype=[
|
201 |
+
"markdown", # Language
|
202 |
+
"number", # Speakers
|
203 |
+
# "number", # Models Tested
|
204 |
+
"number", # Overall
|
205 |
+
"number", # Translation
|
206 |
+
"number", # Classification
|
207 |
+
"markdown", # Best Model
|
208 |
+
"markdown", # CommonVoice Hours
|
209 |
],
|
210 |
)
|
211 |
|
evals.py
CHANGED
@@ -316,14 +316,18 @@ async def main():
|
|
316 |
for score in classification_scores
|
317 |
if score["bcp_47"] == language.bcp_47 and score["model"] == model
|
318 |
]
|
|
|
|
|
319 |
accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
|
|
|
320 |
if translations_for_model:
|
321 |
results_for_language.append(
|
322 |
{
|
323 |
"model": model,
|
324 |
-
"bleu":
|
325 |
-
"chrf":
|
326 |
"accuracy": accuracy,
|
|
|
327 |
}
|
328 |
)
|
329 |
if results_for_language:
|
@@ -336,6 +340,7 @@ async def main():
|
|
336 |
"bleu": mean([s["bleu"] for s in results_for_language]),
|
337 |
"chrf": mean([s["chrf"] for s in results_for_language]),
|
338 |
"accuracy": mean([s["accuracy"] for s in results_for_language]),
|
|
|
339 |
"commonvoice_hours": language.commonvoice_hours
|
340 |
if not pd.isna(language.commonvoice_hours)
|
341 |
else None,
|
|
|
316 |
for score in classification_scores
|
317 |
if score["bcp_47"] == language.bcp_47 and score["model"] == model
|
318 |
]
|
319 |
+
bleu = mean([s["bleu"] for s in translations_for_model])
|
320 |
+
chrf = mean([s["chrf"] for s in translations_for_model])
|
321 |
accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
|
322 |
+
overall_score = (bleu + accuracy) / 2
|
323 |
if translations_for_model:
|
324 |
results_for_language.append(
|
325 |
{
|
326 |
"model": model,
|
327 |
+
"bleu": bleu,
|
328 |
+
"chrf": chrf,
|
329 |
"accuracy": accuracy,
|
330 |
+
"overall_score": overall_score,
|
331 |
}
|
332 |
)
|
333 |
if results_for_language:
|
|
|
340 |
"bleu": mean([s["bleu"] for s in results_for_language]),
|
341 |
"chrf": mean([s["chrf"] for s in results_for_language]),
|
342 |
"accuracy": mean([s["accuracy"] for s in results_for_language]),
|
343 |
+
"overall_score": mean([s["overall_score"] for s in results_for_language]),
|
344 |
"commonvoice_hours": language.commonvoice_hours
|
345 |
if not pd.isna(language.commonvoice_hours)
|
346 |
else None,
|
results.json
CHANGED
@@ -8,12 +8,14 @@
|
|
8 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
9 |
"bleu": 0.4351349353198866,
|
10 |
"chrf": 54.9504915580248,
|
11 |
-
"accuracy": 1.0
|
|
|
12 |
}
|
13 |
],
|
14 |
"bleu": 0.4351349353198866,
|
15 |
"chrf": 54.9504915580248,
|
16 |
"accuracy": 1.0,
|
|
|
17 |
"commonvoice_hours": 2651.0,
|
18 |
"commonvoice_locale": "en",
|
19 |
"population": {
|
@@ -183,12 +185,14 @@
|
|
183 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
184 |
"bleu": 0.3977775857451761,
|
185 |
"chrf": 57.672913792439125,
|
186 |
-
"accuracy": 1.0
|
|
|
187 |
}
|
188 |
],
|
189 |
"bleu": 0.3977775857451761,
|
190 |
"chrf": 57.672913792439125,
|
191 |
"accuracy": 1.0,
|
|
|
192 |
"commonvoice_hours": 422.0,
|
193 |
"commonvoice_locale": "zh-TW",
|
194 |
"population": {
|
@@ -223,12 +227,14 @@
|
|
223 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
224 |
"bleu": 0.333521621016373,
|
225 |
"chrf": 50.48364584189306,
|
226 |
-
"accuracy": 0.9333333333333333
|
|
|
227 |
}
|
228 |
],
|
229 |
"bleu": 0.333521621016373,
|
230 |
"chrf": 50.48364584189306,
|
231 |
"accuracy": 0.9333333333333333,
|
|
|
232 |
"commonvoice_hours": 16.0,
|
233 |
"commonvoice_locale": "hi-IN",
|
234 |
"population": {
|
@@ -249,12 +255,14 @@
|
|
249 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
250 |
"bleu": 0.29160032861883095,
|
251 |
"chrf": 47.668399832701844,
|
252 |
-
"accuracy": 0.9666666666666667
|
|
|
253 |
}
|
254 |
],
|
255 |
"bleu": 0.29160032861883095,
|
256 |
"chrf": 47.668399832701844,
|
257 |
"accuracy": 0.9666666666666667,
|
|
|
258 |
"commonvoice_hours": 446.0,
|
259 |
"commonvoice_locale": "es",
|
260 |
"population": {
|
@@ -308,12 +316,14 @@
|
|
308 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
309 |
"bleu": 0.277257629790728,
|
310 |
"chrf": 46.62779335380641,
|
311 |
-
"accuracy": 0.9333333333333333
|
|
|
312 |
}
|
313 |
],
|
314 |
"bleu": 0.277257629790728,
|
315 |
"chrf": 46.62779335380641,
|
316 |
"accuracy": 0.9333333333333333,
|
|
|
317 |
"commonvoice_hours": 91.0,
|
318 |
"commonvoice_locale": "ar",
|
319 |
"population": {
|
@@ -366,12 +376,14 @@
|
|
366 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
367 |
"bleu": 0.2659144372728079,
|
368 |
"chrf": 44.14831240898717,
|
369 |
-
"accuracy": 0.8333333333333334
|
|
|
370 |
}
|
371 |
],
|
372 |
"bleu": 0.2659144372728079,
|
373 |
"chrf": 44.14831240898717,
|
374 |
"accuracy": 0.8333333333333334,
|
|
|
375 |
"commonvoice_hours": 77.0,
|
376 |
"commonvoice_locale": "ur",
|
377 |
"population": {
|
@@ -391,12 +403,14 @@
|
|
391 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
392 |
"bleu": 0.315663773358301,
|
393 |
"chrf": 49.253978669350964,
|
394 |
-
"accuracy": 0.9666666666666667
|
|
|
395 |
}
|
396 |
],
|
397 |
"bleu": 0.315663773358301,
|
398 |
"chrf": 49.253978669350964,
|
399 |
"accuracy": 0.9666666666666667,
|
|
|
400 |
"commonvoice_hours": 1052.0,
|
401 |
"commonvoice_locale": "fr",
|
402 |
"population": {
|
@@ -473,12 +487,14 @@
|
|
473 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
474 |
"bleu": 0.21265887286151353,
|
475 |
"chrf": 41.501657722373686,
|
476 |
-
"accuracy": 0.9333333333333333
|
|
|
477 |
}
|
478 |
],
|
479 |
"bleu": 0.21265887286151353,
|
480 |
"chrf": 41.501657722373686,
|
481 |
"accuracy": 0.9333333333333333,
|
|
|
482 |
"commonvoice_hours": 49.0,
|
483 |
"commonvoice_locale": "bn",
|
484 |
"population": {
|
@@ -498,42 +514,49 @@
|
|
498 |
"model": "openai/gpt-4o-mini",
|
499 |
"bleu": 0.37370265193281843,
|
500 |
"chrf": 57.010201314973216,
|
501 |
-
"accuracy": 0.9666666666666667
|
|
|
502 |
},
|
503 |
{
|
504 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
505 |
"bleu": 0.27514792195783394,
|
506 |
"chrf": 45.901248962808694,
|
507 |
-
"accuracy": 0.9666666666666667
|
|
|
508 |
},
|
509 |
{
|
510 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
511 |
"bleu": 0.3691905380990064,
|
512 |
"chrf": 54.842418095352954,
|
513 |
-
"accuracy": 0.9666666666666667
|
|
|
514 |
},
|
515 |
{
|
516 |
"model": "google/gemini-2.0-flash-001",
|
517 |
"bleu": 0.4020145367576223,
|
518 |
"chrf": 60.73156386707501,
|
519 |
-
"accuracy": 0.9
|
|
|
520 |
},
|
521 |
{
|
522 |
"model": "deepseek/deepseek-chat",
|
523 |
"bleu": 0.39831859400698993,
|
524 |
"chrf": 59.99225659809846,
|
525 |
-
"accuracy": 0.9666666666666667
|
|
|
526 |
},
|
527 |
{
|
528 |
"model": "microsoft/phi-4",
|
529 |
"bleu": 0.35576182901107084,
|
530 |
"chrf": 56.05856754270042,
|
531 |
-
"accuracy": 0.9
|
|
|
532 |
}
|
533 |
],
|
534 |
"bleu": 0.36235601196089035,
|
535 |
"chrf": 55.756042730168126,
|
536 |
"accuracy": 0.9444444444444445,
|
|
|
537 |
"commonvoice_hours": 177.0,
|
538 |
"commonvoice_locale": "pt",
|
539 |
"population": {
|
@@ -564,12 +587,14 @@
|
|
564 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
565 |
"bleu": 0.3048037308116852,
|
566 |
"chrf": 48.4304965568793,
|
567 |
-
"accuracy": 0.9666666666666667
|
|
|
568 |
}
|
569 |
],
|
570 |
"bleu": 0.3048037308116852,
|
571 |
"chrf": 48.4304965568793,
|
572 |
"accuracy": 0.9666666666666667,
|
|
|
573 |
"commonvoice_hours": 2.3,
|
574 |
"commonvoice_locale": "pa-IN",
|
575 |
"population": {
|
|
|
8 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
9 |
"bleu": 0.4351349353198866,
|
10 |
"chrf": 54.9504915580248,
|
11 |
+
"accuracy": 1.0,
|
12 |
+
"overall_score": 0.7175674676599433
|
13 |
}
|
14 |
],
|
15 |
"bleu": 0.4351349353198866,
|
16 |
"chrf": 54.9504915580248,
|
17 |
"accuracy": 1.0,
|
18 |
+
"overall_score": 0.7175674676599433,
|
19 |
"commonvoice_hours": 2651.0,
|
20 |
"commonvoice_locale": "en",
|
21 |
"population": {
|
|
|
185 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
186 |
"bleu": 0.3977775857451761,
|
187 |
"chrf": 57.672913792439125,
|
188 |
+
"accuracy": 1.0,
|
189 |
+
"overall_score": 0.698888792872588
|
190 |
}
|
191 |
],
|
192 |
"bleu": 0.3977775857451761,
|
193 |
"chrf": 57.672913792439125,
|
194 |
"accuracy": 1.0,
|
195 |
+
"overall_score": 0.698888792872588,
|
196 |
"commonvoice_hours": 422.0,
|
197 |
"commonvoice_locale": "zh-TW",
|
198 |
"population": {
|
|
|
227 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
228 |
"bleu": 0.333521621016373,
|
229 |
"chrf": 50.48364584189306,
|
230 |
+
"accuracy": 0.9333333333333333,
|
231 |
+
"overall_score": 0.6334274771748531
|
232 |
}
|
233 |
],
|
234 |
"bleu": 0.333521621016373,
|
235 |
"chrf": 50.48364584189306,
|
236 |
"accuracy": 0.9333333333333333,
|
237 |
+
"overall_score": 0.6334274771748531,
|
238 |
"commonvoice_hours": 16.0,
|
239 |
"commonvoice_locale": "hi-IN",
|
240 |
"population": {
|
|
|
255 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
256 |
"bleu": 0.29160032861883095,
|
257 |
"chrf": 47.668399832701844,
|
258 |
+
"accuracy": 0.9666666666666667,
|
259 |
+
"overall_score": 0.6291334976427488
|
260 |
}
|
261 |
],
|
262 |
"bleu": 0.29160032861883095,
|
263 |
"chrf": 47.668399832701844,
|
264 |
"accuracy": 0.9666666666666667,
|
265 |
+
"overall_score": 0.6291334976427488,
|
266 |
"commonvoice_hours": 446.0,
|
267 |
"commonvoice_locale": "es",
|
268 |
"population": {
|
|
|
316 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
317 |
"bleu": 0.277257629790728,
|
318 |
"chrf": 46.62779335380641,
|
319 |
+
"accuracy": 0.9333333333333333,
|
320 |
+
"overall_score": 0.6052954815620306
|
321 |
}
|
322 |
],
|
323 |
"bleu": 0.277257629790728,
|
324 |
"chrf": 46.62779335380641,
|
325 |
"accuracy": 0.9333333333333333,
|
326 |
+
"overall_score": 0.6052954815620306,
|
327 |
"commonvoice_hours": 91.0,
|
328 |
"commonvoice_locale": "ar",
|
329 |
"population": {
|
|
|
376 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
377 |
"bleu": 0.2659144372728079,
|
378 |
"chrf": 44.14831240898717,
|
379 |
+
"accuracy": 0.8333333333333334,
|
380 |
+
"overall_score": 0.5496238853030706
|
381 |
}
|
382 |
],
|
383 |
"bleu": 0.2659144372728079,
|
384 |
"chrf": 44.14831240898717,
|
385 |
"accuracy": 0.8333333333333334,
|
386 |
+
"overall_score": 0.5496238853030706,
|
387 |
"commonvoice_hours": 77.0,
|
388 |
"commonvoice_locale": "ur",
|
389 |
"population": {
|
|
|
403 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
404 |
"bleu": 0.315663773358301,
|
405 |
"chrf": 49.253978669350964,
|
406 |
+
"accuracy": 0.9666666666666667,
|
407 |
+
"overall_score": 0.6411652200124838
|
408 |
}
|
409 |
],
|
410 |
"bleu": 0.315663773358301,
|
411 |
"chrf": 49.253978669350964,
|
412 |
"accuracy": 0.9666666666666667,
|
413 |
+
"overall_score": 0.6411652200124838,
|
414 |
"commonvoice_hours": 1052.0,
|
415 |
"commonvoice_locale": "fr",
|
416 |
"population": {
|
|
|
487 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
488 |
"bleu": 0.21265887286151353,
|
489 |
"chrf": 41.501657722373686,
|
490 |
+
"accuracy": 0.9333333333333333,
|
491 |
+
"overall_score": 0.5729961030974234
|
492 |
}
|
493 |
],
|
494 |
"bleu": 0.21265887286151353,
|
495 |
"chrf": 41.501657722373686,
|
496 |
"accuracy": 0.9333333333333333,
|
497 |
+
"overall_score": 0.5729961030974234,
|
498 |
"commonvoice_hours": 49.0,
|
499 |
"commonvoice_locale": "bn",
|
500 |
"population": {
|
|
|
514 |
"model": "openai/gpt-4o-mini",
|
515 |
"bleu": 0.37370265193281843,
|
516 |
"chrf": 57.010201314973216,
|
517 |
+
"accuracy": 0.9666666666666667,
|
518 |
+
"overall_score": 0.6701846592997426
|
519 |
},
|
520 |
{
|
521 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
522 |
"bleu": 0.27514792195783394,
|
523 |
"chrf": 45.901248962808694,
|
524 |
+
"accuracy": 0.9666666666666667,
|
525 |
+
"overall_score": 0.6209072943122503
|
526 |
},
|
527 |
{
|
528 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
529 |
"bleu": 0.3691905380990064,
|
530 |
"chrf": 54.842418095352954,
|
531 |
+
"accuracy": 0.9666666666666667,
|
532 |
+
"overall_score": 0.6679286023828366
|
533 |
},
|
534 |
{
|
535 |
"model": "google/gemini-2.0-flash-001",
|
536 |
"bleu": 0.4020145367576223,
|
537 |
"chrf": 60.73156386707501,
|
538 |
+
"accuracy": 0.9,
|
539 |
+
"overall_score": 0.6510072683788112
|
540 |
},
|
541 |
{
|
542 |
"model": "deepseek/deepseek-chat",
|
543 |
"bleu": 0.39831859400698993,
|
544 |
"chrf": 59.99225659809846,
|
545 |
+
"accuracy": 0.9666666666666667,
|
546 |
+
"overall_score": 0.6824926303368283
|
547 |
},
|
548 |
{
|
549 |
"model": "microsoft/phi-4",
|
550 |
"bleu": 0.35576182901107084,
|
551 |
"chrf": 56.05856754270042,
|
552 |
+
"accuracy": 0.9,
|
553 |
+
"overall_score": 0.6278809145055354
|
554 |
}
|
555 |
],
|
556 |
"bleu": 0.36235601196089035,
|
557 |
"chrf": 55.756042730168126,
|
558 |
"accuracy": 0.9444444444444445,
|
559 |
+
"overall_score": 0.6534002282026674,
|
560 |
"commonvoice_hours": 177.0,
|
561 |
"commonvoice_locale": "pt",
|
562 |
"population": {
|
|
|
587 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
588 |
"bleu": 0.3048037308116852,
|
589 |
"chrf": 48.4304965568793,
|
590 |
+
"accuracy": 0.9666666666666667,
|
591 |
+
"overall_score": 0.6357351987391759
|
592 |
}
|
593 |
],
|
594 |
"bleu": 0.3048037308116852,
|
595 |
"chrf": 48.4304965568793,
|
596 |
"accuracy": 0.9666666666666667,
|
597 |
+
"overall_score": 0.6357351987391759,
|
598 |
"commonvoice_hours": 2.3,
|
599 |
"commonvoice_locale": "pa-IN",
|
600 |
"population": {
|