David Pomerenke committed
Commit 47170a5 · 1 Parent(s): 276ec94

MMLU data loader for 3 parallel datasets

Files changed (3)
  1. evals/datasets_/mmlu.py +87 -17
  2. evals/tasks.py +8 -5
  3. uv.lock +1 -1
evals/datasets_/mmlu.py CHANGED
@@ -1,20 +1,28 @@
+from collections import Counter, defaultdict
+import random
+from datasets import get_dataset_config_names, load_dataset
 from joblib.memory import Memory
-from datasets import load_dataset, get_dataset_config_names
+from langcodes import Language, standardize_tag
 from rich import print
-from langcodes import standardize_tag, Language
-from collections import defaultdict, Counter
+
 cache = Memory(location=".cache", verbose=0).cache
 
+
 @cache
 def _get_dataset_config_names(dataset):
     return get_dataset_config_names(dataset)
 
+
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
 
-def print_counts(slug,subjects_dev, subjects_test):
-    print(f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples")
+
+def print_counts(slug, subjects_dev, subjects_test):
+    print(
+        f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
+    )
+
 
 def print_datasets_analysis():
     print("Category counts and sample counts per dataset:")
@@ -24,7 +32,7 @@ def print_datasets_analysis():
     langs1 = _get_dataset_config_names(slug1)
     langs1 = [standardize_tag(a, macro=True) for a in langs1]
 
-    slug2 = "openai/MMMLU" # does not have dev set! – but: these languages are all also present in Global-MMLU
+    slug2 = "openai/MMMLU"  # does not have a dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
     print_counts(slug2, [], ds2["test"]["Subject"])
     langs2 = _get_dataset_config_names(slug2)
@@ -39,16 +47,27 @@ def print_datasets_analysis():
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
-    print_counts(slug4, [a.split("/")[0] for a in ds4["dev"]["id"]], [a.split("/")[0] for a in ds4["test"]["id"]])
+    print_counts(
+        slug4,
+        [a.split("/")[0] for a in ds4["dev"]["id"]],
+        [a.split("/")[0] for a in ds4["test"]["id"]],
+    )
     langs4 = _get_dataset_config_names(slug4)
 
-
     slug5 = "Eurolingua/mmlux"
     subsets = _get_dataset_config_names(slug5)
     subjects = set(a.rsplit("_", 1)[0] for a in subsets)
-    rows_test = [_load_dataset(slug5, subset)["test"]["id"] for subset in subsets if "_DA" in subset]
+    rows_test = [
+        _load_dataset(slug5, subset)["test"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_test = [a.split("/")[0] for l in rows_test for a in l]
-    rows_dev = [_load_dataset(slug5, subset)["dev"]["id"] for subset in subsets if "_DA" in subset]
+    rows_dev = [
+        _load_dataset(slug5, subset)["dev"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_dev = [a.split("/")[0] for l in rows_dev for a in l]
     print_counts(slug5, rows_dev, rows_test)
     langs5 = list(set(a.rsplit("_", 1)[1].split("-")[0].lower() for a in subsets))
@@ -70,21 +89,72 @@ def print_datasets_analysis():
     print(len(set(langs)))
 
     print("Datasets per language for languages that are not in Global-MMLU:")
-    print(sorted((lang, datasets) for lang, datasets in lang_datasets.items() if slug3 not in datasets))
-    print(Counter(dataset for ds_list in lang_datasets.values() for dataset in ds_list if slug3 not in ds_list))
+    print(
+        sorted(
+            (lang, datasets)
+            for lang, datasets in lang_datasets.items()
+            if slug3 not in datasets
+        )
+    )
+    print(
+        Counter(
+            dataset
+            for ds_list in lang_datasets.values()
+            for dataset in ds_list
+            if slug3 not in ds_list
+        )
+    )
     print(list(set(ds1["test"]["subject"])))
 
+
 # based on this analysis:
 # - we drop the OpenAI dataset, since it does not have a dev set, and since every language that it has is also present in Global-MMLU
 # - we stick to the 5 categories of the AfriMMLU dataset, since this is the most restricted dataset, and these 5 categories are present in all datasets, so this is good for comparability
 
 # AfriMMLU is human-translated, but has only 5 task categories
-# Global-MMLU is partially human-translated, specifically those 15 languages are that are also present in Global-MMLU-Lite, which are mostly from MMMLU; otherwise translated using Google Translate
+# Global-MMLU is mixed: the 15 languages that also appear in Global-MMLU-Lite (mostly taken from MMMLU) are human-translated; the rest are machine-translated using Google Translate
 # Okapi-MMLU is translated using ChatGPT (version unclear)
 # MMLUX is translated using DeepL
-# Therefore, the priority is: AfriMMLU, Global-MMLU, Okapi-MMLU, MMLUX
+# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-print_datasets_analysis()
+# print_datasets_analysis()
 
-def load_mmlu(language_bcp_47):
-    pass
+
+def load_mmlu(language_bcp_47, i):
+    categories = sorted(list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"])))
+    category = categories[i % len(categories)]
+    random.seed(i)
+    j = random.randint(0, 100)
+    print(j)
+    tags_afrimmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("masakhane/afrimmlu")
+    }
+    tags_global_mmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+    }
+    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+    tags_mmlux = set(
+        a.rsplit("_", 1)[1].split("-")[0].lower()
+        for a in _get_dataset_config_names("Eurolingua/mmlux")
+    )
+    if language_bcp_47 in tags_afrimmlu:
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_global_mmlu:
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        def add_choices(split):
+            # gather the four options into a single list
+            split["choices"] = [split["option_a"], split["option_b"], split["option_c"], split["option_d"]]
+            return split
+        ds = ds.map(add_choices)
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_okapi:
+        ds = _load_dataset(
+            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
+        )
+        return ds["test"].filter(lambda x: x["id"] == f"{category}/test/{j}")[0]
+    elif language_bcp_47 in tags_mmlux:
+        # loading this is more complicated, todo
+        return None
+    else:
+        return None
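For reference, a minimal sketch of how the new loader is meant to be driven: `load_mmlu` rotates through the five shared categories by item index and derives a deterministic in-category sample index from the seeded RNG. The import path and the example language are assumptions based on the repo layout, not part of the commit:

    # hypothetical driver for the loader added above
    from datasets_.mmlu import load_mmlu  # assumed import path (evals/ is the package root)

    for i in range(3):
        item = load_mmlu("fr", i)  # "fr" should resolve via the Global-MMLU tags
        if item is None:
            continue  # language not covered by any of the supported datasets
        print(item["subject"], item["question"])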
evals/tasks.py CHANGED
@@ -8,7 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
-from datasets import load_dataset
+from datasets import load_dataset, get_dataset_config_names
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -186,13 +186,10 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-@cache
-def _load_dataset(dataset, subset):
-    return load_dataset(dataset, subset)
+
 
 @cache
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
     item = data["test"][nr]
     def format_item(item):
         return f"""{item['question']}
@@ -220,12 +217,19 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "model": model,
             "bcp_47": language_bcp_47,
             "task": "mmlu",
+            "dataset": ds,
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 
+if __name__ == "__main__":
+    # debug entry point; guarded so importing this module stays side-effect-free
+    from asyncio import run
+
+    results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
+    print(results)
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
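For context, a minimal sketch of the prompt formatting and scoring that `mmlu_and_evaluate` builds on. The item shape follows the `choices` list normalized by `load_mmlu`; the letter labels and the exact-match scoring here are assumptions for illustration, not the committed implementation:

    # hypothetical, self-contained sketch of MMLU prompt formatting and scoring
    def format_item(item):
        # question followed by lettered answer options
        lines = [item["question"]]
        for letter, choice in zip("ABCD", item["choices"]):
            lines.append(f"{letter}: {choice}")
        lines.append("Answer with the letter of the correct answer.")
        return "\n".join(lines)

    def score(response, item):
        # exact match on the first character of the model response
        return float(response.strip()[:1].upper() == item["answer"].upper())

    item = {
        "question": "What is the capital of France?",
        "choices": ["Berlin", "Madrid", "Paris", "Rome"],
        "answer": "C",
    }
    print(format_item(item))
    print(score("C", item))  # 1.0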
uv.lock CHANGED
@@ -898,7 +898,7 @@ dev = [
     { name = "openai", specifier = ">=1.52.2" },
     { name = "protobuf", specifier = ">=5.28.3" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
-    { name = "rich" },
+    { name = "rich", specifier = ">=14.0.0" },
     { name = "sacrebleu", specifier = ">=2.4.3" },
     { name = "sentencepiece", specifier = ">=0.2.0" },
     { name = "tiktoken", specifier = ">=0.8.0" },