David Pomerenke committed
Commit 7a9c651 · 1 parent: 9f25f4c

Better results format (flatten + aggregate 3x), push results to hub

Files changed (2):
  1. evals.py +85 -157
  2. results.json +0 -0
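
The commit message above describes the new results layout: each evaluation task now returns flat records keyed by model, bcp_47, task, metric, score and sentence_nr, and main() aggregates the combined DataFrame three ways (per language, per model, per task) before pushing each table to the Hub. A minimal sketch of that flatten-and-aggregate pattern, with invented scores and only the column names taken from the diff below:

# Toy illustration of the flattened score records and the three aggregations.
# The values are made up; only the column names follow the diff below.
import pandas as pd

records = [
    {"model": "m1", "bcp_47": "en", "task": "translation", "metric": "chrf", "score": 0.62, "sentence_nr": 0},
    {"model": "m1", "bcp_47": "de", "task": "classification", "metric": "accuracy", "score": 1.0, "sentence_nr": 0},
    {"model": "m2", "bcp_47": "en", "task": "translation", "metric": "chrf", "score": 0.55, "sentence_nr": 0},
]
results = pd.DataFrame(records)

# average over sentences for each model x language x task x metric cell
scores = results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
# aggregate per language, counting how many models contributed
lang_results = (
    scores.groupby(["bcp_47", "task", "metric"])
    .agg({"score": "mean", "model": "nunique"})
    .reset_index()
)
# aggregate per model, counting how many languages were covered
model_results = (
    scores.groupby(["model", "task", "metric"])
    .agg({"score": "mean", "bcp_47": "nunique"})
    .reset_index()
)
# aggregate per task across all models and languages
task_results = scores.groupby(["task", "metric"]).agg({"score": "mean"}).reset_index()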
evals.py CHANGED
@@ -12,9 +12,10 @@ import evaluate
 import pandas as pd
 import requests
 from aiolimiter import AsyncLimiter
+from datasets import Dataset
 from dotenv import load_dotenv
 from elevenlabs import AsyncElevenLabs
-from huggingface_hub import AsyncInferenceClient
+from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
 from langcodes import Language, standardize_tag
 from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
@@ -274,13 +275,19 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     else:
         bleu_score = {"bleu": 0}
     chrf_score = chrf.compute(predictions=[prediction], references=[target_sentence])
-    return {
-        "model": model,
-        "bcp_47": original_language["bcp_47"],
-        "mt_bleu": bleu_score["bleu"],
-        "mt_chrf": chrf_score["score"] / 100,
-        "sentence_nr": sentence_nr,
-    }
+    return [
+        {
+            "model": model,
+            "bcp_47": original_language["bcp_47"],
+            "task": "translation",
+            "metric": metric,
+            "score": score,
+            "sentence_nr": sentence_nr,
+        }
+        for metric, score in zip(
+            ["bleu", "chrf"], [bleu_score["bleu"], chrf_score["score"] / 100]
+        )
+    ]
 
 
 metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
@@ -331,16 +338,20 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
         max_tokens=5,
     )
     try:
-        prediction = int(reply.choices[0].message.content.strip())
+        pred = int(reply.choices[0].message.content.strip())
     except ValueError:
-        prediction = -1
-    return {
-        "model": model,
-        "bcp_47": language["bcp_47"],
-        "true": topic_to_number(test_paragraph.topic),
-        "pred": prediction,
-        "sentence_nr": nr,
-    }
+        pred = -1
+    true = topic_to_number(test_paragraph.topic)
+    return [
+        {
+            "model": model,
+            "bcp_47": language["bcp_47"],
+            "task": "classification",
+            "metric": "accuracy",
+            "score": int(pred == true),
+            "sentence_nr": nr,
+        }
+    ]
 
 
 def corrupt_sentence(sentence):
@@ -381,12 +392,16 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
     )
     prediction = reply.choices[0].message.content.strip()
     chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
-    return {
-        "model": model,
-        "bcp_47": language["bcp_47"],
-        "mlm_chrf": chrf_score["score"] / 100,
-        "sentence_nr": nr,
-    }
+    return [
+        {
+            "model": model,
+            "bcp_47": language["bcp_47"],
+            "task": "language_modeling",
+            "metric": "chrf",
+            "score": chrf_score["score"] / 100,
+            "sentence_nr": nr,
+        }
+    ]
 
 
 @cache
@@ -440,16 +455,25 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
     path = f"data/fleurs/{language.fleurs_tag}/audio/dev/{item.fname}"
     pred = await transcribe(path, model=model)
     wer_score = wer.compute(predictions=[pred], references=[item.transcription])
-    chrf_score = chrf.compute(predictions=[pred], references=[item.transcription])
-    return {
-        "model": model,
-        "bcp_47": language["bcp_47"],
-        "asr_wer": wer_score,
-        "asr_chrf": chrf_score["score"] / 100,
-        "sentence_nr": nr,
-    }
+    return [
+        {
+            "model": model,
+            "bcp_47": language["bcp_47"],
+            "task": "asr",
+            "metric": "wer",
+            "score": wer_score,
+            "sentence_nr": nr,
+        }
+    ]
 
 
+tasks = [
+    translate_and_evaluate,
+    classify_and_evaluate,
+    mlm_and_evaluate,
+    # transcribe_and_evaluate,
+]
+
 # ===== run evaluation and aggregate results =====
 
 
@@ -458,9 +482,10 @@ def mean(lst):
 
 
 async def main():
-    print("evaluate translation")
-    translation_scores = [
-        translate_and_evaluate(model, original_language.bcp_47, i)
+    print("running evaluations")
+    results = [
+        task(model, original_language.bcp_47, i)
+        for task in tasks
         for i in range(n_sentences)
         for original_language in langs_eval.itertuples()
         for model in models
@@ -470,130 +495,33 @@ async def main():
             or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
        )
     ]
-    translation_scores = await tqdm_asyncio.gather(*translation_scores, miniters=1)
-    print("evaluate classification")
-    classification_scores = [
-        classify_and_evaluate(model, language.bcp_47, i)
-        for i in range(n_sentences)
-        for language in langs_eval.itertuples()
-        for model in models
-        if language.in_benchmark
-        and (
-            model == model_fast or language.bcp_47 in langs_eval_detailed.bcp_47.values
-        )
-    ]
-    classification_scores = await tqdm_asyncio.gather(
-        *classification_scores, miniters=1
+    results = await tqdm_asyncio.gather(*results, miniters=1)
+    results = pd.DataFrame([r for rs in results for r in rs])
+    results = (
+        results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
     )
-    print("evaluate masked language modeling")
-    mlm_scores = [
-        mlm_and_evaluate(model, language.bcp_47, i)
-        for i in range(n_sentences)
-        for language in langs_eval.itertuples()
-        for model in models
-        if language.in_benchmark
-        and (
-            model == model_fast or language.bcp_47 in langs_eval_detailed.bcp_47.values
-        )
-    ]
-    mlm_scores = await tqdm_asyncio.gather(*mlm_scores, miniters=1)
-    print("evaluate transcription")
-    transcription_scores = [
-        transcribe_and_evaluate(model, language.bcp_47, i)
-        for i in range(n_sentences)
-        for language in transcription_langs_eval.itertuples()
-        for model in transcription_models
-        if language.in_benchmark
-        and (
-            model == transcription_model_fast
-            or language.bcp_47 in transcription_langs_eval_detailed.bcp_47.values
-        )
-    ]
-    transcription_scores = await tqdm_asyncio.gather(*transcription_scores, miniters=1)
-    all_results = []
-    for language in languages.itertuples():
-        results = []
-        for model in models:
-            scores_mt = [
-                score
-                for score in translation_scores
-                if score["bcp_47"] == language.bcp_47 and score["model"] == model
-            ]
-            scores_cls = [
-                score
-                for score in classification_scores
-                if score["bcp_47"] == language.bcp_47 and score["model"] == model
-            ]
-            scores_mlm = [
-                score
-                for score in mlm_scores
-                if score["bcp_47"] == language.bcp_47 and score["model"] == model
-            ]
-            if not scores_mt:
-                continue
-            mt_bleu = mean([s["mt_bleu"] for s in scores_mt])
-            mt_chrf = mean([s["mt_chrf"] for s in scores_mt])
-            cls_acc = mean([s["true"] == s["pred"] for s in scores_cls])
-            mlm_chrf = mean([s["mlm_chrf"] for s in scores_mlm])
-            t2t_score = (mt_chrf + cls_acc + mlm_chrf) / 3
-            results.append(
-                {
-                    "model": model,
-                    "model_type": "text-to-text",
-                    "mt_bleu": mt_bleu,
-                    "mt_chrf": mt_chrf,
-                    "cls_acc": cls_acc,
-                    "mlm_chrf": mlm_chrf,
-                    "t2t_score": t2t_score,
-                }
-            )
-        for model in transcription_models:
-            scores_asr = [
-                score
-                for score in transcription_scores
-                if score["bcp_47"] == language.bcp_47 and score["model"] == model
-            ]
-            if not scores_asr:
-                continue
-            asr_wer = mean([s["asr_wer"] for s in scores_asr])
-            asr_chrf = mean([s["asr_chrf"] for s in scores_asr])
-            results.append(
-                {
-                    "model": model,
-                    "model_type": "speech-to-text",
-                    "asr_wer": asr_wer,
-                    "asr_chrf": asr_chrf,
-                    "s2t_score": (asr_wer + asr_chrf) / 2,
-                }
-            )
-        language_results = {
-            "language_name": language.language_name,
-            "bcp_47": language.bcp_47,
-            "speakers": language.speakers,
-            "scores": results,
-            "commonvoice_hours": language.commonvoice_hours
-            if not pd.isna(language.commonvoice_hours)
-            else None,
-            "commonvoice_locale": language.commonvoice_locale
-            if not pd.isna(language.commonvoice_locale)
-            else None,
-            "population": population(language.bcp_47),
-            "language_family": language_family(language.bcp_47),
-        }
-        for score in [
-            "mt_bleu",
-            "mt_chrf",
-            "cls_acc",
-            "mlm_chrf",
-            "asr_wer",
-            "asr_chrf",
-            "t2t_score",
-            "s2t_score",
-        ]:
-            language_results[score] = mean([s[score] for s in results if score in s])
-        all_results.append(language_results)
-    with open("results.json", "w") as f:
-        json.dump(all_results, f, indent=2, ensure_ascii=False)
+    lang_results = (
+        results.groupby(["bcp_47", "task", "metric"])
+        .agg({"score": "mean", "model": "nunique"})
+        .reset_index()
+    )
+    lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
+    model_results = (
+        results.groupby(["model", "task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique"})
+        .reset_index()
+    )
+    task_results = (
+        results.groupby(["task", "metric"])
+        .agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
+        .reset_index()
+    )
+    HF_REPO = "datenlabor-bmz/global-language-ai-evals"
+    HF_TOKEN = getenv("HUGGINGFACE_ACCESS_TOKEN")
+    Dataset.from_pandas(results).push_to_hub(HF_REPO, "scores", token=HF_TOKEN)
+    Dataset.from_pandas(lang_results).push_to_hub(HF_REPO, "languages", token=HF_TOKEN)
+    Dataset.from_pandas(model_results).push_to_hub(HF_REPO, "models", token=HF_TOKEN)
+    Dataset.from_pandas(task_results).push_to_hub(HF_REPO, "tasks", token=HF_TOKEN)
 
 
 if __name__ == "__main__":
results.json DELETED
The diff for this file is too large to render. See raw diff
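
Since the results now live on the Hub rather than in results.json, the four configs pushed by main() ("scores", "languages", "models", "tasks") can be read back with the datasets library. A sketch, assuming the repo name from the diff and that the dataset is readable for you (pass token=... if it is private):

# Read back the tables that main() pushes; push_to_hub writes them as a "train" split.
from datasets import load_dataset

repo = "datenlabor-bmz/global-language-ai-evals"
scores = load_dataset(repo, "scores", split="train")
languages = load_dataset(repo, "languages", split="train")
print(scores.column_names)  # e.g. model, bcp_47, task, metric, score, sentence_nr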