David Pomerenke
Autonymns and cooler dataset search display
33469f2
raw
history blame
4.53 kB
import asyncio
import json
import numpy as np
import pandas as pd
from rich import print
from tqdm.asyncio import tqdm_asyncio
from languages import languages
from tasks import tasks
from models import models, model_fast
# ===== config =====
n_sentences = 30
langs_eval = languages.iloc[:10]
langs_eval_detailed = languages.iloc[:2]
transcription_langs_eval = languages.iloc[:10]
transcription_langs_eval_detailed = languages.iloc[:5]
# ===== run evaluation and aggregate results =====
async def evaluate():
print("running evaluations")
results = [
task(model, original_language.bcp_47, i)
for task in tasks
for i in range(n_sentences)
for original_language in langs_eval.itertuples()
for model in models["id"]
if original_language.in_benchmark
and (
model == model_fast
or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
)
]
return await tqdm_asyncio.gather(*results, miniters=1)
def aggregate(results):
results = pd.DataFrame([r for rs in results for r in rs])
results = (
results.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()
)
lang_results = (
results.groupby(["bcp_47", "task", "metric"])
.agg({"score": "mean", "model": "nunique"})
.reset_index()
)
lang_results = pd.merge(languages, lang_results, on="bcp_47", how="outer")
model_results = (
results.groupby(["model", "task", "metric"])
.agg({"score": "mean", "bcp_47": "nunique"})
.reset_index()
)
task_results = (
results.groupby(["task", "metric"])
.agg({"score": "mean", "bcp_47": "nunique", "model": "nunique"})
.reset_index()
)
return results, lang_results, model_results, task_results
def mean(lst):
return sum(lst) / len(lst) if lst else None
def fmt_name(s):
return (
" ".join(w.capitalize() for w in s.split("-"))
.replace("Gpt", "GPT")
.replace("ai", "AI")
)
def serialize(df):
return df.replace({np.nan: None}).to_dict(orient="records")
def make_model_table(df):
df["task_metric"] = df["task"] + "_" + df["metric"]
df = df.drop(columns=["task", "metric"])
task_metrics = df["task_metric"].unique()
df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
df["average"] = df[task_metrics].mean(axis=1)
df = df.sort_values(by="average", ascending=False).reset_index()
for row in [*task_metrics, "average"]:
df[row] = df[row].round(2)
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
df["creation_date"] = df["creation_date"].dt.strftime("%Y-%m-%d")
df["provider"] = df["model"].str.split("/").str[0].apply(fmt_name)
df["model"] = df["model"].str.split("/").str[1].apply(fmt_name)
df["rank"] = df.index + 1
df = df[["rank", "provider", "model", "hf_id", "creation_date", "size", "type", "license", "average", *task_metrics]]
return df
def make_language_table(df):
df["task_metric"] = df["task"] + "_" + df["metric"]
df = df.drop(columns=["task", "metric"])
task_metrics = df["task_metric"].unique()
df = df.pivot(index="bcp_47", columns="task_metric", values="score").fillna(0).reset_index()
df["average"] = df[task_metrics].mean(axis=1)
for row in [*task_metrics, "average"]:
df[row] = df[row].round(2)
df = pd.merge(languages, df, on="bcp_47", how="outer")
df = df.sort_values(by="speakers", ascending=False)
df = df[["language_name", "autonym", "speakers", "family", "average", "in_benchmark", *task_metrics]]
return df
async def main():
results = await evaluate()
results, lang_results, model_results, task_results = aggregate(results)
all_results = {
"tasks": serialize(task_results),
"models": serialize(model_results),
"languages": serialize(lang_results),
"scores": serialize(results),
}
with open("results.json", "w") as f:
json.dump(all_results, f, indent=2, ensure_ascii=False)
datasets_df = pd.read_json("data/datasets.json")
all_tables = {
"model_table": serialize(make_model_table(model_results)),
"language_table": serialize(make_language_table(lang_results)),
"dataset_table": serialize(datasets_df),
}
with open("frontend/public/results.json", "w") as f:
json.dump(all_tables, f, indent=2, ensure_ascii=False)
if __name__ == "__main__":
asyncio.run(main())