David Pomerenke committed
Commit 47170a5 · Parent: 276ec94

MMLU data loader for 3 parallel datasets

Files changed:
- evals/datasets_/mmlu.py  +87 -17
- evals/tasks.py  +8 -5
- uv.lock  +1 -1
evals/datasets_/mmlu.py CHANGED

@@ -1,20 +1,28 @@
+from collections import Counter, defaultdict
+import random
+from datasets import get_dataset_config_names, load_dataset
 from joblib.memory import Memory
+from langcodes import Language, standardize_tag
 from rich import print
-from collections import defaultdict, Counter
+
 cache = Memory(location=".cache", verbose=0).cache
 
+
 @cache
 def _get_dataset_config_names(dataset):
     return get_dataset_config_names(dataset)
 
+
 @cache
 def _load_dataset(dataset, subset, **kwargs):
     return load_dataset(dataset, subset, **kwargs)
 
+
+def print_counts(slug, subjects_dev, subjects_test):
+    print(
+        f"{slug:<25} {len(list(set(subjects_test))):>3} test categories, {len(subjects_test):>6} samples, {len(list(set(subjects_dev))):>3} dev categories, {len(subjects_dev):>6} dev samples"
+    )
+
 
 def print_datasets_analysis():
     print("Category counts and sample counts per dataset:")
@@ -24,7 +32,7 @@ def print_datasets_analysis():
     langs1 = _get_dataset_config_names(slug1)
     langs1 = [standardize_tag(a, macro=True) for a in langs1]
 
-    slug2 = "openai/MMMLU"
+    slug2 = "openai/MMMLU"  # does not have dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
     print_counts(slug2, [], ds2["test"]["Subject"])
     langs2 = _get_dataset_config_names(slug2)
@@ -39,16 +47,27 @@
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
-    print_counts(
+    print_counts(
+        slug4,
+        [a.split("/")[0] for a in ds4["dev"]["id"]],
+        [a.split("/")[0] for a in ds4["test"]["id"]],
+    )
     langs4 = _get_dataset_config_names(slug4)
 
     slug5 = "Eurolingua/mmlux"
     subsets = _get_dataset_config_names(slug5)
     subjects = set(a.rsplit("_", 1)[0] for a in subsets)
-    rows_test = [
+    rows_test = [
+        _load_dataset(slug5, subset)["test"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_test = [a.split("/")[0] for l in rows_test for a in l]
-    rows_dev = [
+    rows_dev = [
+        _load_dataset(slug5, subset)["dev"]["id"]
+        for subset in subsets
+        if "_DA" in subset
+    ]
     rows_dev = [a.split("/")[0] for l in rows_dev for a in l]
     print_counts(slug5, rows_dev, rows_test)
     langs5 = list(set(a.rsplit("_", 1)[1].split("-")[0].lower() for a in subsets))
@@ -70,21 +89,72 @@ def print_datasets_analysis():
     print(len(set(langs)))
 
     print("Datasets per language for languages that are not in Global-MMLU:")
-    print(
+    print(
+        sorted(
+            (lang, datasets)
+            for lang, datasets in lang_datasets.items()
+            if slug3 not in datasets
+        )
+    )
+    print(
+        Counter(
+            dataset
+            for ds_list in lang_datasets.values()
+            for dataset in ds_list
+            if slug3 not in ds_list
+        )
+    )
     print(list(set(ds1["test"]["subject"])))
 
+
 # based on this analysis:
 # - we drop the OpenAI dataset, since it does not have a dev set, and since every language that it has is also present in Global-MMLU
 # - we stick to the 5 categories of the AfriMMLU dataset, since this is the most restricted dataset, and these 5 categories are present in all datasets, so this is good for comparability
 
 # AfriMMLU is human-translated, but has only 5 task categories
-# Global-MMLU is
+# Global-MMLU is mixed-translated, specifically those 15 languages that are also present in Global-MMLU-Lite, which are mostly from MMMLU; otherwise translated using Google Translate
 # Okapi-MMLU is translated using ChatGPT (version unclear)
 # MMLUX is translated using DeepL
-# Therefore, the priority is: AfriMMLU, Global-MMLU, Okapi-MMLU
+# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
 
-print_datasets_analysis()
+# print_datasets_analysis()
 
-def load_mmlu(language_bcp_47):
+
+def load_mmlu(language_bcp_47, i):
+    categories = sorted(list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"])))
+    category = categories[i % len(categories)]
+    random.seed(i)
+    j = random.randint(0, 100)
+    print(j)
+    tags_afrimmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("masakhane/afrimmlu")
+    }
+    tags_global_mmlu = {
+        standardize_tag(a, macro=True): a
+        for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
+    }
+    tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
+    tags_mmlux = set(
+        a.rsplit("_", 1)[1].split("-")[0].lower()
+        for a in _get_dataset_config_names("Eurolingua/mmlux")
+    )
+    if language_bcp_47 in tags_afrimmlu:
+        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_global_mmlu:
+        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
+        def add_choices(split):
+            split["choices"] = list(zip([split["option_a"], split["option_b"], split["option_c"], split["option_d"]]))
+            return split
+        ds = ds.map(add_choices)
+        return ds["test"].filter(lambda x: x["subject"] == category)[j]
+    elif language_bcp_47 in tags_okapi:
+        ds = _load_dataset(
+            "lighteval/okapi_mmlu", language_bcp_47, trust_remote_code=True
+        )
+        return ds["test"].filter(lambda x: x["id"] == f"{category}/test/{j}")[0]
+    elif language_bcp_47 in tags_mmlux:
+        # loading this is more complicated, todo
+        return None
+    else:
+        return None
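Review note on the Global-MMLU branch of load_mmlu: zip() over a single list yields one-element tuples, so split["choices"] would end up as [(option_a,), (option_b,), ...] rather than a flat list of the four answer options. Below is a minimal sketch of the mapping as presumably intended; the flat-list shape is an assumption about what downstream formatting expects, not part of the commit:

    def add_choices(row):
        # Hypothetical fix: collect the four Global-MMLU option columns into a flat list,
        # mirroring the "choices" field presumably provided by the other MMLU datasets.
        row["choices"] = [row["option_a"], row["option_b"], row["option_c"], row["option_d"]]
        return row

    # ds = ds.map(add_choices)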
evals/tasks.py CHANGED

@@ -8,7 +8,7 @@ from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
 from models import complete, transcribe
-from datasets import load_dataset
+from datasets import load_dataset, get_dataset_config_names
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
@@ -186,13 +186,10 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
-
-def _load_dataset(dataset, subset):
-    return load_dataset(dataset, subset)
+
 
 @cache
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    data = _load_dataset("CohereForAI/Global-MMLU", language_bcp_47)
     item = data["test"][nr]
     def format_item(item):
         return f"""{item['question']}
@@ -220,12 +217,18 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "model": model,
             "bcp_47": language_bcp_47,
             "task": "mmlu",
+            "dataset": ds,
             "metric": "accuracy",
             "score": acc,
             "sentence_nr": nr,
         }
     ]
 
+from asyncio import run
+results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
+print(results)
+exit()
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
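Review note on evals/tasks.py: the added smoke test (from asyncio import run ... exit()) sits at module level, so it runs, and then terminates the interpreter, as soon as tasks.py is imported; the data and ds names used inside mmlu_and_evaluate also do not appear to be defined in the hunks shown here, so they presumably come from code outside the diff. A minimal sketch of the same smoke test kept import-safe, assuming it is meant as a temporary manual check rather than part of the pipeline:

    # Hypothetical guard: only run the manual check when the file is executed directly,
    # so importing evals/tasks.py stays free of side effects.
    if __name__ == "__main__":
        from asyncio import run

        results = run(mmlu_and_evaluate("gpt-4o-mini", "fr", 0))
        print(results)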
uv.lock CHANGED

@@ -898,7 +898,7 @@ dev = [
     { name = "openai", specifier = ">=1.52.2" },
     { name = "protobuf", specifier = ">=5.28.3" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
-    { name = "rich" },
+    { name = "rich", specifier = ">=14.0.0" },
     { name = "sacrebleu", specifier = ">=2.4.3" },
     { name = "sentencepiece", specifier = ">=0.2.0" },
     { name = "tiktoken", specifier = ">=0.8.0" },