David Pomerenke committed on
Commit 1167b2d · 1 Parent(s): 7fc657e

Show classification and overall score in app

Files changed (3)
  1. app.py +17 -17
  2. evals.py +7 -2
  3. results.json +40 -15
app.py CHANGED
@@ -1,7 +1,6 @@
 import json
 
 import gradio as gr
-import numpy as np
 import pandas as pd
 import plotly.graph_objects as go
 import plotly.express as px
@@ -160,7 +159,7 @@ def create_language_stats_df(results):
     for lang in results:
         # Find the best model and its BLEU score
         best_score = max(
-            lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
+            lang["scores"] or [{"overall_score": None, "model": None}], key=lambda x: x["overall_score"]
         )
 
         model = best_score["model"]
@@ -178,18 +177,18 @@ def create_language_stats_df(results):
         row = {
             "Language": f"**{lang['language_name']}**",
             "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
-            "Models Tested": len(lang["scores"]),
-            "Average BLEU": round(lang["bleu"], 3)
-            if lang["bleu"] is not None
+            # "Models Tested": len(lang["scores"]),
+            "Overall": round(lang["overall_score"], 3)
+            if lang["overall_score"] is not None
             else "N/A",
-            "Best Model": model_link,
-            "Best Model BLEU": round(best_score["bleu"], 3)
-            if best_score["bleu"] is not None
+            "Trans-lation": round(lang["bleu"], 3)
+            if lang["bleu"] is not None
             else "N/A",
-            "CommonVoice Hours": commonvoice_link,
-            "Accuracy": round(lang["accuracy"], 3)
+            "Classi-fication": round(lang["accuracy"], 3)
             if lang["accuracy"] is not None
             else "N/A",
+            "Best Model": model_link,
+            "CommonVoice Hours": commonvoice_link,
         }
         flat_data.append(row)
 
@@ -199,13 +198,14 @@ def create_language_stats_df(results):
         label="Language Results",
         show_search="search",
         datatype=[
-            "markdown",
-            "number",
-            "number",
-            "number",
-            "markdown",
-            "number",
-            "markdown",
+            "markdown",  # Language
+            "number",  # Speakers
+            # "number",  # Models Tested
+            "number",  # Overall
+            "number",  # Translation
+            "number",  # Classification
+            "markdown",  # Best Model
+            "markdown",  # CommonVoice Hours
         ],
     )
 
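Editor's note on the app.py change: the new column order in the row dict has to line up positionally with the datatype list passed to the Gradio dataframe. Below is a minimal, self-contained sketch of that pairing. It uses a single toy row (the link targets and speaker count are placeholders, the Overall/Trans-lation/Classi-fication values are rounded from the English entry in results.json), omits show_search to stay version-agnostic, and is not the app's real results.json loading code.

# Sketch: one toy row paired with the datatype list from the diff above.
# "markdown" cells render links and bold text; "number" cells stay sortable.
import gradio as gr
import pandas as pd

toy_rows = [
    {
        "Language": "**English**",
        "Speakers (M)": 1000.0,  # placeholder value
        "Overall": 0.718,
        "Trans-lation": 0.435,
        "Classi-fication": 1.0,
        "Best Model": "[llama-3.3-70b-instruct](https://example.com/model)",  # placeholder link
        "CommonVoice Hours": "[2651](https://example.com/commonvoice)",  # placeholder link
    }
]
df = pd.DataFrame(toy_rows)

with gr.Blocks() as demo:
    gr.Dataframe(
        df,
        label="Language Results",
        datatype=[
            "markdown",  # Language
            "number",    # Speakers
            "number",    # Overall
            "number",    # Translation
            "number",    # Classification
            "markdown",  # Best Model
            "markdown",  # CommonVoice Hours
        ],
    )

if __name__ == "__main__":
    demo.launch()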
evals.py CHANGED
@@ -316,14 +316,18 @@ async def main():
             for score in classification_scores
             if score["bcp_47"] == language.bcp_47 and score["model"] == model
         ]
+        bleu = mean([s["bleu"] for s in translations_for_model])
+        chrf = mean([s["chrf"] for s in translations_for_model])
         accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
+        overall_score = (bleu + accuracy) / 2
         if translations_for_model:
             results_for_language.append(
                 {
                     "model": model,
-                    "bleu": mean([s["bleu"] for s in translations_for_model]),
-                    "chrf": mean([s["chrf"] for s in translations_for_model]),
+                    "bleu": bleu,
+                    "chrf": chrf,
                     "accuracy": accuracy,
+                    "overall_score": overall_score,
                 }
             )
         if results_for_language:
@@ -336,6 +340,7 @@ async def main():
                 "bleu": mean([s["bleu"] for s in results_for_language]),
                 "chrf": mean([s["chrf"] for s in results_for_language]),
                 "accuracy": mean([s["accuracy"] for s in results_for_language]),
+                "overall_score": mean([s["overall_score"] for s in results_for_language]),
                 "commonvoice_hours": language.commonvoice_hours
                 if not pd.isna(language.commonvoice_hours)
                 else None,
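Editor's note on the evals.py change: the per-model overall_score is the unweighted mean of translation BLEU and classification accuracy (chrf is averaged and stored but not folded into the score). A short sketch reproducing the English value from results.json, using numbers copied from this commit:

# Sketch: reproduce a per-model overall_score from results.json.
# Per the evals.py change, overall_score = (bleu + accuracy) / 2.
bleu = 0.4351349353198866  # English, meta-llama/llama-3.3-70b-instruct
accuracy = 1.0             # classification accuracy for the same model
overall_score = (bleu + accuracy) / 2
print(overall_score)       # 0.7175674676599433, matching results.json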
results.json CHANGED
@@ -8,12 +8,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.4351349353198866,
         "chrf": 54.9504915580248,
-        "accuracy": 1.0
+        "accuracy": 1.0,
+        "overall_score": 0.7175674676599433
       }
     ],
     "bleu": 0.4351349353198866,
     "chrf": 54.9504915580248,
     "accuracy": 1.0,
+    "overall_score": 0.7175674676599433,
     "commonvoice_hours": 2651.0,
     "commonvoice_locale": "en",
     "population": {
@@ -183,12 +185,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.3977775857451761,
         "chrf": 57.672913792439125,
-        "accuracy": 1.0
+        "accuracy": 1.0,
+        "overall_score": 0.698888792872588
       }
     ],
     "bleu": 0.3977775857451761,
     "chrf": 57.672913792439125,
     "accuracy": 1.0,
+    "overall_score": 0.698888792872588,
     "commonvoice_hours": 422.0,
     "commonvoice_locale": "zh-TW",
     "population": {
@@ -223,12 +227,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.333521621016373,
         "chrf": 50.48364584189306,
-        "accuracy": 0.9333333333333333
+        "accuracy": 0.9333333333333333,
+        "overall_score": 0.6334274771748531
       }
     ],
     "bleu": 0.333521621016373,
     "chrf": 50.48364584189306,
     "accuracy": 0.9333333333333333,
+    "overall_score": 0.6334274771748531,
     "commonvoice_hours": 16.0,
     "commonvoice_locale": "hi-IN",
     "population": {
@@ -249,12 +255,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.29160032861883095,
         "chrf": 47.668399832701844,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6291334976427488
       }
     ],
     "bleu": 0.29160032861883095,
     "chrf": 47.668399832701844,
     "accuracy": 0.9666666666666667,
+    "overall_score": 0.6291334976427488,
     "commonvoice_hours": 446.0,
     "commonvoice_locale": "es",
     "population": {
@@ -308,12 +316,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.277257629790728,
         "chrf": 46.62779335380641,
-        "accuracy": 0.9333333333333333
+        "accuracy": 0.9333333333333333,
+        "overall_score": 0.6052954815620306
       }
     ],
     "bleu": 0.277257629790728,
     "chrf": 46.62779335380641,
     "accuracy": 0.9333333333333333,
+    "overall_score": 0.6052954815620306,
     "commonvoice_hours": 91.0,
     "commonvoice_locale": "ar",
     "population": {
@@ -366,12 +376,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.2659144372728079,
         "chrf": 44.14831240898717,
-        "accuracy": 0.8333333333333334
+        "accuracy": 0.8333333333333334,
+        "overall_score": 0.5496238853030706
       }
     ],
     "bleu": 0.2659144372728079,
     "chrf": 44.14831240898717,
     "accuracy": 0.8333333333333334,
+    "overall_score": 0.5496238853030706,
     "commonvoice_hours": 77.0,
     "commonvoice_locale": "ur",
     "population": {
@@ -391,12 +403,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.315663773358301,
         "chrf": 49.253978669350964,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6411652200124838
       }
     ],
     "bleu": 0.315663773358301,
     "chrf": 49.253978669350964,
     "accuracy": 0.9666666666666667,
+    "overall_score": 0.6411652200124838,
     "commonvoice_hours": 1052.0,
     "commonvoice_locale": "fr",
     "population": {
@@ -473,12 +487,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.21265887286151353,
         "chrf": 41.501657722373686,
-        "accuracy": 0.9333333333333333
+        "accuracy": 0.9333333333333333,
+        "overall_score": 0.5729961030974234
       }
     ],
     "bleu": 0.21265887286151353,
     "chrf": 41.501657722373686,
     "accuracy": 0.9333333333333333,
+    "overall_score": 0.5729961030974234,
     "commonvoice_hours": 49.0,
     "commonvoice_locale": "bn",
     "population": {
@@ -498,42 +514,49 @@
         "model": "openai/gpt-4o-mini",
         "bleu": 0.37370265193281843,
         "chrf": 57.010201314973216,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6701846592997426
       },
       {
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.27514792195783394,
         "chrf": 45.901248962808694,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6209072943122503
       },
       {
         "model": "mistralai/mistral-small-24b-instruct-2501",
         "bleu": 0.3691905380990064,
         "chrf": 54.842418095352954,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6679286023828366
       },
       {
         "model": "google/gemini-2.0-flash-001",
         "bleu": 0.4020145367576223,
         "chrf": 60.73156386707501,
-        "accuracy": 0.9
+        "accuracy": 0.9,
+        "overall_score": 0.6510072683788112
       },
       {
         "model": "deepseek/deepseek-chat",
         "bleu": 0.39831859400698993,
         "chrf": 59.99225659809846,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6824926303368283
       },
       {
         "model": "microsoft/phi-4",
         "bleu": 0.35576182901107084,
         "chrf": 56.05856754270042,
-        "accuracy": 0.9
+        "accuracy": 0.9,
+        "overall_score": 0.6278809145055354
       }
     ],
     "bleu": 0.36235601196089035,
     "chrf": 55.756042730168126,
     "accuracy": 0.9444444444444445,
+    "overall_score": 0.6534002282026674,
     "commonvoice_hours": 177.0,
     "commonvoice_locale": "pt",
     "population": {
@@ -564,12 +587,14 @@
         "model": "meta-llama/llama-3.3-70b-instruct",
         "bleu": 0.3048037308116852,
         "chrf": 48.4304965568793,
-        "accuracy": 0.9666666666666667
+        "accuracy": 0.9666666666666667,
+        "overall_score": 0.6357351987391759
       }
     ],
     "bleu": 0.3048037308116852,
     "chrf": 48.4304965568793,
     "accuracy": 0.9666666666666667,
+    "overall_score": 0.6357351987391759,
     "commonvoice_hours": 2.3,
     "commonvoice_locale": "pa-IN",
     "population": {