import asyncio
import json

import numpy as np
import pandas as pd
from rich import print
from tqdm.asyncio import tqdm_asyncio

from languages import languages
from tasks import tasks
from models import models, model_fast

# ===== config =====
n_sentences = 30  # sentences evaluated per task/language/model combination
langs_eval = languages.iloc[:10]  # languages evaluated (with model_fast only by default)
langs_eval_detailed = languages.iloc[:2]  # languages also evaluated with every model
transcription_langs_eval = languages.iloc[:10]
transcription_langs_eval_detailed = languages.iloc[:5]
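
# `languages` is assumed to be a DataFrame (it is sliced with .iloc above and
# merged on "bcp_47" in aggregate() below); this script only relies on its
# "bcp_47" and "in_benchmark" columns.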

# ===== run evaluation and aggregate results =====
async def evaluate():
    # Build one coroutine per (task, sentence, language, model) combination.
    # Languages outside langs_eval_detailed are only run with model_fast; the
    # detailed subset is run with every model.
    print("running evaluations")
    results = [
        task(model, original_language.bcp_47, i)
        for task in tasks
        for i in range(n_sentences)
        for original_language in langs_eval.itertuples()
        for model in models
        if original_language.in_benchmark
        and (
            model == model_fast
            or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
        )
    ]
    return await tqdm_asyncio.gather(*results, miniters=1)
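

# A sketch of the record shape this script assumes: the exact fields come from
# tasks.py, but aggregate() below relies on each gathered item being a list of
# dicts with at least these keys (values here are illustrative only):
#   {"model": "...", "bcp_47": "en", "task": "translation",
#    "metric": "bleu", "score": 0.42}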


def aggregate(results):
    results = pd.DataFrame([r for rs in results for r in rs])
    results = (
        results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
    )
    lang_results = (
        results.groupby(["bcp_47", "task", "metric"])
        .agg({"score": "mean", "model": "nunique"})
        .reset_index()
    )
    lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
    model_results = (
        results.groupby(["model", "task", "metric"])
        .agg({"score": "mean", "bcp_47": "nunique"})
        .reset_index()
    )
    task_results = (
        results.groupby(["task", "metric"])
        .agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
        .reset_index()
    )
    return results, lang_results, model_results, task_results


def mean(lst):
    return sum(lst) / len(lst) if lst else None


def fmt_name(s):
    return " ".join(w.capitalize() for w in s.split("-")).replace("Gpt", "GPT").replace("ai", "AI")
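# Illustrative examples (the real ids come from models.py):
#   fmt_name("openai")      -> "OpenAI"
#   fmt_name("gpt-4o-mini") -> "GPT 4o Mini"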


def serialize(df):
    return df.replace({np.nan: None}).to_dict(orient="records")
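# Illustrative: serialize(pd.DataFrame({"score": [1.0, np.nan]})) gives
# [{"score": 1.0}, {"score": None}], so json.dump can emit null for NaN.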


def make_model_table(model_results):
    # one column per task/metric pair, one row per model, ranked by average score
    model_results["task_metric"] = model_results["task"] + "_" + model_results["metric"]
    model_results = model_results.drop(columns=["task", "metric"])
    model_table = model_results.pivot(
        index="model", columns="task_metric", values="score"
    ).fillna(0)
    model_table["average"] = model_table.mean(axis=1)
    model_table = model_table.sort_values(by="average", ascending=False)
    model_table = model_table.round(2).reset_index()
    # split "provider/model" ids into display names
    model_table["provider"] = model_table["model"].str.split("/").str[0].apply(fmt_name)
    model_table["model"] = model_table["model"].str.split("/").str[1].apply(fmt_name)
    model_table["rank"] = model_table.index + 1
    model_table = model_table[
        ["rank", "provider", "model", "average", *model_table.columns[1:-3]]
    ]
    return model_table
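# Resulting column order: rank, provider, model, average, then one column per
# task_metric pair present in the results.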


async def main():
    results = await evaluate()
    results, lang_results, model_results, task_results = aggregate(results)
    all_results = {
        "tasks": serialize(task_results),
        "models": serialize(model_results),
        "languages": serialize(lang_results),
        "scores": serialize(results),
    }
    with open("results.json", "w") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    model_table = make_model_table(model_results)
    all_tables = {
        "model_table": serialize(model_table),
    }
    with open("frontend/public/results.json", "w") as f:
        json.dump(all_tables, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())