File size: 4,976 Bytes
da6e1bc 723f963 da6e1bc 723f963 da6e1bc 723f963 da6e1bc 3ed02d5 da6e1bc 3ed02d5 da6e1bc 3ed02d5 da6e1bc 3ed02d5 da6e1bc 3ed02d5 da6e1bc 3ed02d5 723f963 3ed02d5 da6e1bc d1a7111 723f963 d1a7111 430bde6 723f963 d1a7111 723f963 da6e1bc 3ed02d5 11c32ae 723f963 da6e1bc d1a7111 723f963 11c32ae 723f963 da6e1bc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import asyncio
import json
import numpy as np
import pandas as pd
from countries import make_country_table
from languages import languages
from models import model_fast, models
from rich import print
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio
# ===== config =====
# Number of sentence indices each task is run on; evaluate() iterates
# over range(n_sentences).
n_sentences = 30
# First 10 rows of `languages`: the languages evaluate() iterates over.
langs_eval = languages.iloc[:10]
# First 2 rows of `languages`: evaluate() runs every model on these
# languages, and only `model_fast` on the remaining ones.
langs_eval_detailed = languages.iloc[:2]
# Presumably the analogous selections for transcription tasks; neither name
# is referenced elsewhere in the visible part of this file.
transcription_langs_eval = languages.iloc[:10]
transcription_langs_eval_detailed = languages.iloc[:5]
# ===== run evaluation and aggregate results =====


async def evaluate():
    """Create every selected evaluation call and await all of them together.

    One call ``task(model, bcp_47, sentence_index)`` is created for each
    combination of task, sentence index in ``range(n_sentences)``, language
    in ``langs_eval`` and model id in ``models["id"]``, subject to two
    filters:

    * languages whose ``in_benchmark`` flag is falsy are skipped;
    * languages outside ``langs_eval_detailed`` are only run on
      ``model_fast``; languages inside it are run on every model.

    Returns:
        The list produced by ``tqdm_asyncio.gather`` for the created calls.
    """
    print("running evaluations")
    detailed_codes = langs_eval_detailed.bcp_47.values
    pending = []
    for task in tasks:
        for sentence_index in range(n_sentences):
            for lang in langs_eval.itertuples():
                if not lang.in_benchmark:
                    continue
                run_all_models = lang.bcp_47 in detailed_codes
                for model_id in models["id"]:
                    if run_all_models or model_id == model_fast:
                        pending.append(task(model_id, lang.bcp_47, sentence_index))
    return await tqdm_asyncio.gather(*pending, miniters=1)
def aggregate(results):
    """Collapse raw evaluation records into per-language/model/task tables.

    Args:
        results: Iterable of iterables of records (one inner iterable per
            evaluation call). The flattened records are passed to
            ``pd.DataFrame`` and must provide the columns ``model``,
            ``bcp_47``, ``task``, ``metric`` and ``score``.

    Returns:
        A 4-tuple ``(scores, lang_results, model_results, task_results)``:

        * ``scores``: mean of all remaining columns per
          (model, bcp_47, task, metric);
        * ``lang_results``: mean score and number of distinct models per
          (bcp_47, task, metric), outer-merged with ``languages`` on
          ``bcp_47``;
        * ``model_results``: mean score and number of distinct languages per
          (model, task, metric);
        * ``task_results``: mean score and number of distinct languages and
          models per (task, metric).
    """
    flat_records = []
    for batch in results:
        flat_records.extend(batch)
    scores = pd.DataFrame(flat_records)
    # Collapse repeated measurements of the same combination by averaging
    # every remaining column.
    scores = scores.groupby(["model", "bcp_47", "task", "metric"]).mean().reset_index()

    def summarise(keys, counted):
        # Mean score per group plus the number of distinct values of each
        # column listed in `counted`.
        spec = {"score": "mean"}
        for column in counted:
            spec[column] = "nunique"
        return scores.groupby(keys).agg(spec).reset_index()

    per_language = summarise(["bcp_47", "task", "metric"], ["model"])
    per_language = pd.merge(languages, per_language, on="bcp_47", how="outer")
    per_model = summarise(["model", "task", "metric"], ["bcp_47"])
    per_task = summarise(["task", "metric"], ["bcp_47", "model"])
    return scores, per_language, per_model, per_task
def mean(lst):
    """Return the arithmetic mean of ``lst``, or ``None`` if it is empty."""
    if not lst:
        return None
    return sum(lst) / len(lst)
def fmt_name(s):
    """Turn a hyphenated identifier into a display name.

    The string is split on ``-``; every part is capitalized and the parts
    are joined with spaces. Two acronyms are restored afterwards: a leading
    ``Gpt`` becomes ``GPT`` and a word-final ``ai`` becomes ``AI``.

    Only a trailing ``ai`` is upper-cased. Replacing every ``ai`` substring
    would also mangle ordinary words (``haiku`` -> ``HAIku``).

    >>> fmt_name("gpt-4o-mini")
    'GPT 4o Mini'
    >>> fmt_name("openai")
    'OpenAI'
    >>> fmt_name("claude-3-haiku")
    'Claude 3 Haiku'
    """
    words = []
    for part in s.split("-"):
        word = part.capitalize().replace("Gpt", "GPT")
        if word.endswith("ai"):
            word = word[:-2] + "AI"
        words.append(word)
    return " ".join(words)
def serialize(df):
    """Convert a DataFrame into a list of per-row dicts with NaN as ``None``.

    NaN values are replaced by ``None`` before the conversion so that they
    are written as JSON ``null`` when the result is dumped.
    """
    without_nan = df.replace({np.nan: None})
    return without_nan.to_dict(orient="records")
def make_model_table(df):
    """Build the ranked per-model table.

    Args:
        df: Long-format scores with the columns ``model``, ``task``,
            ``metric`` and ``score`` (one row per model/task/metric). The
            frame is not modified.

    Returns:
        A DataFrame with one row per model, sorted by descending average
        score. It has one column per ``<task>_<metric>`` combination (missing
        combinations count as 0), their row-wise ``average``, a 1-based
        ``rank``, and the metadata columns taken from ``models``. Scores are
        rounded to 2 decimals; ``provider`` and ``model`` are the formatted
        parts of the model id before and after the ``/``.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # "task_metric" column as a side effect.
    scores = df.assign(task_metric=df["task"] + "_" + df["metric"])
    task_metrics = scores["task_metric"].unique()
    table = scores.pivot(index="model", columns="task_metric", values="score").fillna(0)
    table["average"] = table[task_metrics].mean(axis=1)
    table = table.sort_values(by="average", ascending=False).reset_index()
    for column in [*task_metrics, "average"]:
        table[column] = table[column].round(2)
    table = pd.merge(table, models, left_on="model", right_on="id", how="left")
    table["creation_date"] = table["creation_date"].dt.strftime("%Y-%m-%d")
    # Model ids have the form "<provider>/<model>"; derive the provider first
    # because the "model" column is overwritten on the next line.
    table["provider"] = table["model"].str.split("/").str[0].apply(fmt_name)
    table["model"] = table["model"].str.split("/").str[1].apply(fmt_name)
    # The merge yields a fresh 0..n-1 index in sorted order.
    table["rank"] = table.index + 1
    table = table[
        [
            "rank",
            "provider",
            "model",
            "hf_id",
            "creation_date",
            "size",
            "type",
            "license",
            "average",
            *task_metrics,
        ]
    ]
    return table
def make_language_table(df):
    """Build the per-language table.

    Args:
        df: Long-format scores with the columns ``bcp_47``, ``task``,
            ``metric`` and ``score``. Rows whose ``task``/``metric`` are
            missing are tolerated (the outer merge with ``languages`` in
            ``aggregate`` can produce them for languages without results).
            The frame is not modified.

    Returns:
        A DataFrame with one row per language, sorted by descending
        ``speakers``. It holds the language metadata from ``languages``, one
        column per ``<task>_<metric>`` combination and their row-wise
        ``average``, with scores rounded to 2 decimals.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain a
    # "task_metric" column as a side effect.
    scores = df.assign(task_metric=df["task"] + "_" + df["metric"])
    # Rows without a task/metric yield a missing label. Keep it out of the
    # list of score columns so it can neither dilute the average nor appear
    # as a NaN-named column in the output.
    task_metrics = scores["task_metric"].dropna().unique()
    table = (
        scores.pivot(index="bcp_47", columns="task_metric", values="score")
        .fillna(0)
        .reset_index()
    )
    table["average"] = table[task_metrics].mean(axis=1)
    for column in [*task_metrics, "average"]:
        table[column] = table[column].round(2)
    table = pd.merge(languages, table, on="bcp_47", how="outer")
    table = table.sort_values(by="speakers", ascending=False)
    table = table[
        [
            "bcp_47",
            "language_name",
            "autonym",
            "speakers",
            "family",
            "average",
            "in_benchmark",
            *task_metrics,
        ]
    ]
    return table
async def main():
    """Run the evaluation and write the raw and frontend result files.

    Writes two JSON files:

    * ``results.json``: the aggregated task, model, language and score
      tables;
    * ``frontend/public/results.json``: the model table, language table,
      dataset table (read from ``data/datasets.json``) and country table.
    """
    results = await evaluate()
    results, lang_results, model_results, task_results = aggregate(results)
    all_results = {
        "tasks": serialize(task_results),
        "models": serialize(model_results),
        "languages": serialize(lang_results),
        "scores": serialize(results),
    }
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)

    datasets_df = pd.read_json("data/datasets.json")
    language_table = make_language_table(lang_results)
    all_tables = {
        "model_table": serialize(make_model_table(model_results)),
        "language_table": serialize(language_table),
        "dataset_table": serialize(datasets_df),
        "countries": make_country_table(language_table),
    }
    with open("frontend/public/results.json", "w", encoding="utf-8") as f:
        json.dump(all_tables, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())
|