David Pomerenke
committed on
Commit
·
260c1a3
1
Parent(s):
3680a5f
Run on 40 languages, additional models
Browse files
- evals/datasets_/mmlu.py +1 -0
- evals/main.py +3 -2
- evals/models.py +13 -9
- evals/tasks.py +13 -7
- frontend/src/components/SpeakerPlot.js +3 -3
- results.json +0 -0
evals/datasets_/mmlu.py
CHANGED
@@ -156,6 +156,7 @@ def load_mmlu(language_bcp_47, nr):
|
|
156 |
task = ds["test"].filter(lambda x: x["subject"] == category)[i]
|
157 |
return "CohereForAI/Global-MMLU", examples, task
|
158 |
elif language_bcp_47 in tags_okapi:
|
|
|
159 |
ds = _load_dataset(
|
160 |
"lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
|
161 |
)
|
|
|
156 |
task = ds["test"].filter(lambda x: x["subject"] == category)[i]
|
157 |
return "CohereForAI/Global-MMLU", examples, task
|
158 |
elif language_bcp_47 in tags_okapi:
|
159 |
+
return None, None, None # FIXME
|
160 |
ds = _load_dataset(
|
161 |
"lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
|
162 |
)
|
evals/main.py
CHANGED
@@ -12,8 +12,8 @@ from tasks import tasks
|
|
12 |
# ===== config =====
|
13 |
|
14 |
n_sentences = 10
|
15 |
-
n_languages =
|
16 |
-
n_models =
|
17 |
|
18 |
# ===== run evaluation and aggregate results =====
|
19 |
|
@@ -26,6 +26,7 @@ async def evaluate():
|
|
26 |
for i in range(n_sentences)
|
27 |
for lang in languages.iloc[:n_languages].itertuples()
|
28 |
for model in models["id"].iloc[:n_models]
|
|
|
29 |
]
|
30 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
31 |
|
|
|
12 |
# ===== config =====
|
13 |
|
14 |
n_sentences = 10
|
15 |
+
n_languages = 40
|
16 |
+
n_models = 25
|
17 |
|
18 |
# ===== run evaluation and aggregate results =====
|
19 |
|
|
|
26 |
for i in range(n_sentences)
|
27 |
for lang in languages.iloc[:n_languages].itertuples()
|
28 |
for model in models["id"].iloc[:n_models]
|
29 |
+
if lang.in_benchmark # TODO
|
30 |
]
|
31 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
32 |
|
evals/models.py
CHANGED
@@ -20,22 +20,26 @@ models = [
|
|
20 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$
|
21 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
22 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
23 |
-
# "meta-llama/llama-2-70b-chat", # 0.9$; not
|
|
|
24 |
"openai/gpt-4.1-nano", # 0.4$
|
25 |
"openai/gpt-4o-mini", # 0.6$
|
26 |
-
|
27 |
-
|
28 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
29 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
30 |
-
|
31 |
-
|
32 |
"google/gemini-2.5-flash-preview", # 0.6$
|
33 |
-
|
34 |
"google/gemma-3-27b-it", # 0.2$
|
35 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
36 |
-
"qwen/qwq-32b", # 0.2$
|
|
|
|
|
37 |
"deepseek/deepseek-chat-v3-0324", # 1.1$
|
38 |
-
|
|
|
39 |
"microsoft/phi-4-multimodal-instruct", # 0.1$
|
40 |
"amazon/nova-micro-v1", # 0.09$
|
41 |
]
|
@@ -152,7 +156,7 @@ def get_hf_metadata(row):
|
|
152 |
return empty
|
153 |
try:
|
154 |
info = api.model_info(id)
|
155 |
-
license = info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
|
156 |
return {
|
157 |
"hf_id": info.id,
|
158 |
"creation_date": info.created_at,
|
|
|
20 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$
|
21 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
22 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
23 |
+
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
24 |
+
"openai/gpt-4.1-mini", # 1.6$
|
25 |
"openai/gpt-4.1-nano", # 0.4$
|
26 |
"openai/gpt-4o-mini", # 0.6$
|
27 |
+
"openai/gpt-3.5-turbo-0613", # 2$
|
28 |
+
"openai/gpt-3.5-turbo", # 1.5$
|
29 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
30 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
31 |
+
"mistralai/mistral-saba", # 0.6$
|
32 |
+
"mistralai/mistral-nemo", # 0.08$
|
33 |
"google/gemini-2.5-flash-preview", # 0.6$
|
34 |
+
"google/gemini-2.0-flash-lite-001", # 0.3$
|
35 |
"google/gemma-3-27b-it", # 0.2$
|
36 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
37 |
+
# "qwen/qwq-32b", # 0.2$
|
38 |
+
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
39 |
+
# "qwen/qwen-2-72b-instruct", # 0.9$
|
40 |
"deepseek/deepseek-chat-v3-0324", # 1.1$
|
41 |
+
"deepseek/deepseek-chat", # 0.89$
|
42 |
+
"microsoft/phi-4", # 0.07$
|
43 |
"microsoft/phi-4-multimodal-instruct", # 0.1$
|
44 |
"amazon/nova-micro-v1", # 0.09$
|
45 |
]
|
|
|
156 |
return empty
|
157 |
try:
|
158 |
info = api.model_info(id)
|
159 |
+
license = (info.card_data.license or "").replace("-", " ").replace("mit", "MIT").title()
|
160 |
return {
|
161 |
"hf_id": info.id,
|
162 |
"creation_date": info.created_at,
|
evals/tasks.py
CHANGED
@@ -221,13 +221,19 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
221 |
{"role": "assistant", "content": example["answer"]},
|
222 |
]
|
223 |
messages += [{"role": "user", "content": format_item(task)}]
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
return [
|
232 |
{
|
233 |
"model": model,
|
|
|
221 |
{"role": "assistant", "content": example["answer"]},
|
222 |
]
|
223 |
messages += [{"role": "user", "content": format_item(task)}]
|
224 |
+
try:
|
225 |
+
reply = await complete(
|
226 |
+
model=model,
|
227 |
+
messages=messages,
|
228 |
+
temperature=0,
|
229 |
+
max_tokens=1,
|
230 |
+
)
|
231 |
+
acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
|
232 |
+
except Exception as e:
|
233 |
+
if "ResponsibleAIPolicyViolation" in str(e):
|
234 |
+
acc = 0
|
235 |
+
else:
|
236 |
+
raise e
|
237 |
return [
|
238 |
{
|
239 |
"model": model,
|
frontend/src/components/SpeakerPlot.js
CHANGED
@@ -73,9 +73,9 @@ const SpeakerPlot = ({ data }) => {
|
|
73 |
textStrokeOpacity: 0,
|
74 |
textFillOpacity: 0
|
75 |
}),
|
76 |
-
Plot.tip(['The
|
77 |
-
x:
|
78 |
-
y: languages[
|
79 |
})
|
80 |
]
|
81 |
})
|
|
|
73 |
textStrokeOpacity: 0,
|
74 |
textFillOpacity: 0
|
75 |
}),
|
76 |
+
Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
|
77 |
+
x: 40,
|
78 |
+
y: languages[39].cumSpeakers / 1e6
|
79 |
})
|
80 |
]
|
81 |
})
|
results.json
CHANGED
The diff for this file is too large to render.
See raw diff
|
|