David Pomerenke committed on
Commit 260c1a3 · 1 Parent(s): 3680a5f

Run on 40 languages, additional models

evals/datasets_/mmlu.py CHANGED
@@ -156,6 +156,7 @@ def load_mmlu(language_bcp_47, nr):
         task = ds["test"].filter(lambda x: x["subject"] == category)[i]
         return "CohereForAI/Global-MMLU", examples, task
     elif language_bcp_47 in tags_okapi:
+        return None, None, None # FIXME
         ds = _load_dataset(
             "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
         )
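The new early return disables the Okapi branch before any dataset download: every caller of load_mmlu now receives a (None, None, None) triple for Okapi-only languages until the FIXME is resolved. A minimal sketch of the guard a caller then needs (load_mmlu is stubbed here, and run_mmlu_item is a hypothetical name, not from the repo):

def load_mmlu(language_bcp_47, nr):
    # Stand-in for the short-circuited Okapi branch above.
    return None, None, None

def run_mmlu_item(language_bcp_47, nr):
    name, examples, task = load_mmlu(language_bcp_47, nr)
    if task is None:
        return None  # Okapi-only language: nothing to evaluate yet
    return name, examples, task

print(run_mmlu_item("xx", 0))  # -> None while the branch is disabled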
evals/main.py CHANGED
@@ -12,8 +12,8 @@ from tasks import tasks
 # ===== config =====
 
 n_sentences = 10
-n_languages = 15
-n_models = 20
+n_languages = 40
+n_models = 25
 
 # ===== run evaluation and aggregate results =====
 
@@ -26,6 +26,7 @@ async def evaluate():
         for i in range(n_sentences)
         for lang in languages.iloc[:n_languages].itertuples()
         for model in models["id"].iloc[:n_models]
+        if lang.in_benchmark # TODO
     ]
     return await tqdm_asyncio.gather(*results, miniters=1)
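The trailing if clause filters the cross product while the list is being built, so out-of-benchmark languages never create coroutine objects that would have to be awaited and discarded. A toy illustration of that semantics (not repo code; fake_task and the tuples are made up):

import asyncio

async def fake_task(model, lang):
    return model, lang

langs = [("eng", True), ("xxx", False), ("deu", True)]

async def main():
    # The `if` runs during list construction: "xxx" never becomes a coroutine.
    coros = [fake_task("m1", code) for code, in_benchmark in langs if in_benchmark]
    return await asyncio.gather(*coros)

print(asyncio.run(main()))  # [('m1', 'eng'), ('m1', 'deu')]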
evals/models.py CHANGED
@@ -20,22 +20,26 @@ models = [
     "meta-llama/llama-3.3-70b-instruct", # 0.3$
     "meta-llama/llama-3.1-70b-instruct", # 0.3$
     "meta-llama/llama-3-70b-instruct", # 0.4$
-    # "meta-llama/llama-2-70b-chat", # 0.9$; not enough context
+    # "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
+    "openai/gpt-4.1-mini", # 1.6$
     "openai/gpt-4.1-nano", # 0.4$
     "openai/gpt-4o-mini", # 0.6$
-    # "openai/gpt-3.5-turbo-0613", # 2$
-    # "openai/gpt-3.5-turbo", # 1.5$
+    "openai/gpt-3.5-turbo-0613", # 2$
+    "openai/gpt-3.5-turbo", # 1.5$
     # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
     "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
-    # "mistralai/mistral-saba", # 0.6$
-    # "mistralai/mistral-nemo", # 0.08$
+    "mistralai/mistral-saba", # 0.6$
+    "mistralai/mistral-nemo", # 0.08$
     "google/gemini-2.5-flash-preview", # 0.6$
-    # "google/gemini-2.0-flash-lite-001", # 0.3$
+    "google/gemini-2.0-flash-lite-001", # 0.3$
     "google/gemma-3-27b-it", # 0.2$
     # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
-    "qwen/qwq-32b", # 0.2$
+    # "qwen/qwq-32b", # 0.2$
+    # "qwen/qwen-2.5-72b-instruct", # 0.39$
+    # "qwen/qwen-2-72b-instruct", # 0.9$
     "deepseek/deepseek-chat-v3-0324", # 1.1$
-    # "microsoft/phi-4", # 0.07$; only 16k tokens context
+    "deepseek/deepseek-chat", # 0.89$
+    "microsoft/phi-4", # 0.07$
     "microsoft/phi-4-multimodal-instruct", # 0.1$
     "amazon/nova-micro-v1", # 0.09$
 ]
@@ -152,7 +156,7 @@ def get_hf_metadata(row):
         return empty
     try:
         info = api.model_info(id)
-        license = info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
+        license = (info.card_data.license or "").replace("-", " ").replace("mit", "MIT").title()
         return {
             "hf_id": info.id,
             "creation_date": info.created_at,
evals/tasks.py CHANGED
@@ -221,13 +221,19 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             {"role": "assistant", "content": example["answer"]},
         ]
     messages += [{"role": "user", "content": format_item(task)}]
-    reply = await complete(
-        model=model,
-        messages=messages,
-        temperature=0,
-        max_tokens=1,
-    )
-    acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
+    try:
+        reply = await complete(
+            model=model,
+            messages=messages,
+            temperature=0,
+            max_tokens=1,
+        )
+        acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
+    except Exception as e:
+        if "ResponsibleAIPolicyViolation" in str(e):
+            acc = 0
+        else:
+            raise e
     return [
         {
             "model": model,
frontend/src/components/SpeakerPlot.js CHANGED
@@ -73,9 +73,9 @@ const SpeakerPlot = ({ data }) => {
         textStrokeOpacity: 0,
         textFillOpacity: 0
       }),
-      Plot.tip(['The 41 most spoken languages cover 80% of all speakers.'], {
-        x: 41,
-        y: languages[40].cumSpeakers / 1e6
+      Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+        x: 40,
+        y: languages[39].cumSpeakers / 1e6
       })
     ]
   })
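The plot fix is an off-by-one correction: in the zero-based languages array, the 40th most spoken language sits at index 39, so the tip text, x position, and y value now agree. A toy version of the underlying arithmetic (assumed logic; the repo computes cumSpeakers upstream):

speakers = [500, 300, 120, 50, 30]  # per-language speaker counts, sorted descending
total = sum(speakers)
cum = [sum(speakers[: i + 1]) for i in range(len(speakers))]

# Smallest n whose top-n cumulative total reaches 80% of all speakers;
# that total lives at index n - 1.
n = next(i + 1 for i, c in enumerate(cum) if c >= 0.8 * total)
print(n, cum[n - 1] / total)  # -> 2 0.8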
results.json CHANGED
The diff for this file is too large to render. See raw diff