File size: 3,657 Bytes
da6e1bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import asyncio
import json

import numpy as np
import pandas as pd
from rich import print
from tqdm.asyncio import tqdm_asyncio
from languages import languages
from tasks import tasks
from models import models, model_fast

# ===== config =====

n_sentences = 30  # sentences evaluated per (task, language, model) combination
langs_eval = languages.iloc[:10]  # language subset evaluated with the fast model
langs_eval_detailed = languages.iloc[:2]  # smaller subset evaluated with *every* model
transcription_langs_eval = languages.iloc[:10]  # NOTE(review): not referenced in this file — presumably used by a task module; confirm
transcription_langs_eval_detailed = languages.iloc[:5]  # NOTE(review): likewise unreferenced here

# ===== run evaluation and aggregate results =====

async def evaluate():
    """Schedule one coroutine per (task, sentence, language, model) combo and await them all.

    Languages not flagged ``in_benchmark`` are skipped entirely. Expensive
    (non-fast) models only run on the small "detailed" language subset;
    the fast model runs on every benchmark language.
    """
    print("running evaluations")
    detailed_codes = langs_eval_detailed.bcp_47.values
    coroutines = []
    for task in tasks:
        for sentence_idx in range(n_sentences):
            for lang in langs_eval.itertuples():
                if not lang.in_benchmark:
                    continue
                for model in models:
                    if model != model_fast and lang.bcp_47 not in detailed_codes:
                        continue
                    coroutines.append(task(model, lang.bcp_47, sentence_idx))
    return await tqdm_asyncio.gather(*coroutines, miniters=1)

def aggregate(results):
    """Flatten per-call score records and average them at several granularities.

    ``results`` is a list of lists of record dicts (one inner list per
    evaluation call). Returns four DataFrames:
    (per model+language+task+metric scores, per-language, per-model, per-task).
    """
    flat_records = [record for batch in results for record in batch]
    scores = pd.DataFrame(flat_records)
    scores = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
    by_language = (
        scores.groupby(["bcp_47", "task", "metric"])
        .agg({"score": "mean", "model": "nunique"})
        .reset_index()
    )
    # outer merge so every known language appears, even with no scores yet
    by_language = pd.merge(languages, by_language, on="bcp_47", how="outer")
    by_model = (
        scores.groupby(["model", "task", "metric"])
        .agg({"score": "mean", "bcp_47": "nunique"})
        .reset_index()
    )
    by_task = (
        scores.groupby(["task", "metric"])
        .agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
        .reset_index()
    )
    return scores, by_language, by_model, by_task

def mean(lst):
    """Return the arithmetic mean of *lst*, or None for an empty list."""
    if not lst:
        return None
    return sum(lst) / len(lst)


def fmt_name(s):
    """Turn a kebab-case model/provider slug into a display name.

    Hyphens become spaces and each word is capitalized, with two branding
    fixes: a word starting with "Gpt" becomes "GPT…", and a word *ending*
    in "ai" gets an uppercase "AI" suffix ("openai" -> "OpenAI",
    "xai" -> "XAI").
    """
    words = []
    for w in s.split("-"):
        w = w.capitalize()
        # After capitalize(), "Gpt" can only appear at the start of a word.
        w = w.replace("Gpt", "GPT")
        # BUG FIX: the old blanket replace("ai", "AI") fired on any lowercase
        # "ai" substring, mangling ordinary words ("sailor" -> "SAIlor").
        # Only the suffix is a branding convention, so fix only that.
        if w.endswith("ai"):
            w = w[:-2] + "AI"
        words.append(w)
    return " ".join(words)

def serialize(df):
    """Convert a DataFrame into a list of row dicts, mapping NaN to None for JSON."""
    cleaned = df.replace({np.nan: None})
    return cleaned.to_dict(orient="records")

def make_model_table(model_results):
    """Pivot per-(model, task, metric) scores into one display row per model.

    Expects columns "model" (formatted "provider/model-slug"), "task",
    "metric", "score". Returns a DataFrame with columns
    rank, provider, model, average, then one column per "<task>_<metric>"
    pair, sorted by the row-mean "average" (best model first).

    BUG FIX: the original mutated the caller's DataFrame (added a
    "task_metric" column and dropped "task"/"metric" on the argument);
    we now operate on a copy so the input is left untouched.
    """
    df = model_results.copy()
    df["task_metric"] = df["task"] + "_" + df["metric"]
    df = df.drop(columns=["task", "metric"])
    # missing (model, task_metric) cells count as a score of 0
    table = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
    table["average"] = table.mean(axis=1)
    table = table.sort_values(by="average", ascending=False)
    table = table.round(2).reset_index()
    # "provider/model-slug" -> prettified provider and model columns
    table["provider"] = table["model"].str.split("/").str[0].apply(fmt_name)
    table["model"] = table["model"].str.split("/").str[1].apply(fmt_name)
    table["rank"] = table.index + 1
    # fixed columns first, then the task_metric score columns
    table = table[
        ["rank", "provider", "model", "average", *table.columns[1:-3]]
    ]
    return table


async def main():
    """Run every evaluation, aggregate the scores, and write both JSON outputs."""
    raw = await evaluate()
    scores, lang_results, model_results, task_results = aggregate(raw)
    # full result dump
    with open("results.json", "w") as f:
        json.dump(
            {
                "tasks": serialize(task_results),
                "models": serialize(model_results),
                "languages": serialize(lang_results),
                "scores": serialize(scores),
            },
            f,
            indent=2,
            ensure_ascii=False,
        )
    # compact pre-pivoted leaderboard for the frontend
    with open("frontend/public/results.json", "w") as f:
        json.dump(
            {"model_table": serialize(make_model_table(model_results))},
            f,
            indent=2,
            ensure_ascii=False,
        )


if __name__ == "__main__":
    # Script entry point: run the full evaluation pipeline once.
    asyncio.run(main())