David Pomerenke committed
Commit 47170a5 · Parent: 276ec94

MMLU data loader for 3 parallel datasets

Files changed:
- evals/datasets_/mmlu.py  +87 -17
- evals/tasks.py  +8 -5
- uv.lock  +1 -1
evals/datasets_/mmlu.py CHANGED

@@ -1,20 +1,28 @@
+from collections import Counter, defaultdict
+import random
+from datasets import get_dataset_config_names, load_dataset
 from joblib.memory import Memory
+from langcodes import Language, standardize_tag
 from rich import print
-from collections import defaultdict, Counter
+
 cache = Memory(location=".cache", verbose=0).cache
 
+
 @cache
 def _get_dataset_config_names(dataset):
     return get_dataset_config_names(dataset)
 
+
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
 
+
+def print_counts(slug, subjects_dev, subjects_test):
+    print(
+        f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
+    )
+
 
 def print_datasets_analysis():
     print("Category counts and sample counts per dataset:")
@@ -24,7 +32,7 @@ def print_datasets_analysis():
     langs1 = _get_dataset_config_names(slug1)
     langs1 = [standardize_tag(a, macro=True) for a in langs1]
 
-    slug2 = "openai/MMMLU"
+    slug2 = "openai/MMMLU"  # does not have dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
     print_counts(slug2, [], ds2["test"]["Subject"])
     langs2 = _get_dataset_config_names(slug2)
@@ -39,16 +47,27 @@
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
-    print_counts(
+    print_counts(
+        slug4,
+        [a.split("/")[0] for a in ds4["dev"]["id"]],
+        [a.split("/")[0] for a in ds4["test"]["id"]],
+    )
     langs4 = _get_dataset_config_names(slug4)
 
     slug5 = "Eurolingua/mmlux"
     subsets = _get_dataset_config_names(slug5)
     subjects = set(a.rsplit("_", 1)[0] for a in subsets)
-    rows_test = [
+    rows_test = [
+        _load_dataset(slug5, subset)["test"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_test = [a.split("/")[0] for l in rows_test for a in l]
-    rows_dev = [
+    rows_dev = [
+        _load_dataset(slug5, subset)["dev"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_dev = [a.split("/")[0] for l in rows_dev for a in l]
     print_counts(slug5, rows_dev, rows_test)
     langs5 = list(set(a.rsplit("_", 1)[1].split("-")[0].lower() for a in subsets))
@@ -70,21 +89,72 @@ def print_datasets_analysis():
     print(len(set(langs)))
 
     print("Datasets per language for languages that are not in Global-MMLU:")
-    print(
+    print(
+        sorted(
+            (lang, datasets)
+            for lang, datasets in lang_datasets.items()
+            if slug3 not in datasets
+        )
+    )
+    print(
+        Counter(
+            dataset
+            for ds_list in lang_datasets.values()
+            for dataset in ds_list
+            if slug3 not in ds_list
+        )
+    )
     print(list(set(ds1["test"]["subject"])))
 
+
 # based on this analysis:
 # - we drop the OpenAI dataset, since it does not have a dev set, and since every language that it has is also present in Global-MMLU
 # - we stick to the 5 categories of the AfriMMLU dataset, since this is the most restricted dataset, and these 5 categories are present in all datasets, so this is good for comparability
 
 # AfriMMLU is human-translated, but has only 5 task categories
-# Global-MMLU is
+# Global-MMLU is mixed-translated, specifically those 15 languages that are also present in Global-MMLU-Lite, which are mostly from MMMLU; otherwise translated using Google Translate
 # Okapi-MMLU is translated using ChatGPT (version unclear)
 # MMLUX is translated using DeepL
-# Therefore, the priority is: AfriMMLU, Global-MMLU, Okapi-MMLU
+# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-print_datasets_analysis()
+# print_datasets_analysis()
 
-def load_mmlu(language_bcp_47):
+
+def load_mmlu(language_bcp_47, i):
+    categories = sorted(list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"])))
+    category = categories[i % len(categories)]
+    random.seed(i)
+    j = random.randint(0, 100)
+    print(j)
+    tags_afrimmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("masakhane/afrimmlu")
+    }
+    tags_global_mmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+    }
+    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+    tags_mmlux = set(
+        a.rsplit("_", 1)[1].split("-")[0].lower()
+        for a in _get_dataset_config_names("Eurolingua/mmlux")
+    )
+    if language_bcp_47 in tags_afrimmlu:
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_global_mmlu:
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        def add_choices(split):
+            split["choices"] = list(zip([split["option_a"], split["option_b"], split["option_c"], split["option_d"]]))
+            return split
+        ds = ds.map(add_choices)
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_okapi:
+        ds = _load_dataset(
+            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
+        )
+        return ds["test"].filter(lambda x: x["id"] == f"{category}/test/{j}")[0]
+    elif language_bcp_47 in tags_mmlux:
+        # loading this is more complicated, todo
+        return None
+    else:
+        return None
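Review note on the Global-MMLU branch of load_mmlu: zip() over a single list yields one-element tuples, so split["choices"] would end up as [(option_a,), (option_b,), ...] rather than a flat list of the four answer options. Below is a minimal sketch of the mapping as presumably intended; the flat-list shape is an assumption about what downstream formatting expects, not part of the commit:

    def add_choices(row):
        # Hypothetical fix: collect the four Global-MMLU option columns into a flat list,
        # mirroring the "choices" field presumably provided by the other MMLU datasets.
        row["choices"] = [row["option_a"], row["option_b"], row["option_c"], row["option_d"]]
        return row

    # ds = ds.map(add_choices)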
evals/tasks.py CHANGED

@@ -8,7 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
-from datasets import load_dataset
+from datasets import load_dataset, get_dataset_config_names
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -186,13 +186,10 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
-def _load_dataset(dataset, subset):
-    return load_dataset(dataset, subset)
+
 
 @cache
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
     item = data["test"][nr]
     def format_item(item):
         return f"""{item['question']}
@@ -220,12 +217,18 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "model": model,
             "bcp_47": language_bcp_47,
             "task": "mmlu",
+            "dataset": ds,
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 
+from asyncio import run
+results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
+print(results)
+exit()
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
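Review note on evals/tasks.py: the added smoke test (from asyncio import run ... exit()) sits at module level, so it runs, and then terminates the interpreter, as soon as tasks.py is imported; the data and ds names used inside mmlu_and_evaluate also do not appear to be defined in the hunks shown here, so they presumably come from code outside the diff. A minimal sketch of the same smoke test kept import-safe, assuming it is meant as a temporary manual check rather than part of the pipeline:

    # Hypothetical guard: only run the manual check when the file is executed directly,
    # so importing evals/tasks.py stays free of side effects.
    if __name__ == "__main__":
        from asyncio import run

        results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
        print(results)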
uv.lock CHANGED

@@ -898,7 +898,7 @@ dev = [
     { name = "openai", specifier = ">=1.52.2" },
     { name = "protobuf", specifier = ">=5.28.3" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
-    { name = "rich" },
+    { name = "rich", specifier = ">=14.0.0" },
     { name = "sacrebleu", specifier = ">=2.4.3" },
     { name = "sentencepiece", specifier = ">=0.2.0" },
     { name = "tiktoken", specifier = ">=0.8.0" },