naazahrani commited on
Commit
1cf9767
·
verified ·
1 Parent(s): 54ea89a

Delete evaluation/ar

Browse files
evaluation/ar/acva_5_shot.json DELETED
@@ -1,119 +0,0 @@
1
- {
2
- "results": {
3
- "acva": {
4
- "alias": "acva",
5
- "acc,none": 0.7746268656716417,
6
- "acc_stderr,none": 0.004477269169728854,
7
- "acc_norm,none": 0.7632606199770379,
8
- "acc_norm_stderr,none": 0.004554991129754026
9
- }
10
- },
11
- "group_subtasks": {
12
- "acva": []
13
- },
14
- "configs": {
15
- "acva": {
16
- "task": "acva",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "FreedomIntelligence/ACVA-Arabic-Cultural-Value-Alignment",
21
- "dataset_kwargs": {
22
- "trust_remote_code": true
23
- },
24
- "test_split": "test",
25
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _format_subject(subject):\n \n arabic_words = subtasks_ar[subtasks.index(subject)]\n return arabic_words\n \n def _generate_subject(doc):\n subject = _format_subject(doc[\"id\"].split(\"-\")[0])\n\n return subject\n \n def _process_docs(doc):\n keys = [\"\u0635\u062d\",\n \"\u062e\u0637\u0623\"]\n subject = _generate_subject(doc)\n gold = keys.index(doc['answer'])\n out_doc = {\n \"id\": doc[\"id\"],\n \"query\": \"\\n\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644:\" + doc[\"question\"]+\"\\n\u0625\u062c\u0627\u0628\u0629:'\",\n \"choices\": keys,\n \"gold\": gold,\n \"subject\": subject,\n }\n \n return out_doc\n\n return dataset.map(_process_docs)\n",
26
- "doc_to_text": "query",
27
- "doc_to_target": "gold",
28
- "doc_to_choice": "choices",
29
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0639\u0628\u0627\u0631\u0627\u062a \u0625\u0645\u0627 \u0635\u062d\u064a\u062d\u0629 \u0623\u0648 \u062e\u0627\u0637\u0626\u0629 \u062d\u0648\u0644 {{subject}}\n \u0627\u0644\u0631\u062c\u0627\u0621 \u062a\u0635\u0646\u064a\u0641 \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0625\u0644\u0649 '\u0635\u062d' \u0623\u0648 '\u062e\u0637\u0623' \u062f\u0648\u0646 \u0634\u0631\u062d",
30
- "target_delimiter": " ",
31
- "fewshot_delimiter": "\n\n",
32
- "num_fewshot": 5,
33
- "metric_list": [
34
- {
35
- "metric": "acc",
36
- "aggregation": "mean",
37
- "higher_is_better": true
38
- },
39
- {
40
- "metric": "acc_norm",
41
- "aggregation": "mean",
42
- "higher_is_better": true
43
- }
44
- ],
45
- "output_type": "multiple_choice",
46
- "repeats": 1,
47
- "should_decontaminate": false,
48
- "metadata": {
49
- "version": 0.0
50
- }
51
- }
52
- },
53
- "versions": {
54
- "acva": 0.0
55
- },
56
- "n-shot": {
57
- "acva": 5
58
- },
59
- "higher_is_better": {
60
- "acva": {
61
- "acc": true,
62
- "acc_norm": true
63
- }
64
- },
65
- "n-samples": {
66
- "acva": {
67
- "original": 8710,
68
- "effective": 8710
69
- }
70
- },
71
- "config": {
72
- "model": "vllm",
73
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
74
- "batch_size": 1,
75
- "batch_sizes": [],
76
- "device": null,
77
- "use_cache": null,
78
- "limit": null,
79
- "bootstrap_iters": 100000,
80
- "gen_kwargs": null,
81
- "random_seed": 0,
82
- "numpy_seed": 1234,
83
- "torch_seed": 1234,
84
- "fewshot_seed": 1234
85
- },
86
- "git_hash": "8e1bd48d",
87
- "date": 1735662713.7617116,
88
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
89
- "transformers_version": "4.47.1",
90
- "upper_git_hash": null,
91
- "tokenizer_pad_token": [
92
- "<unk>",
93
- "0"
94
- ],
95
- "tokenizer_eos_token": [
96
- "</s>",
97
- "2"
98
- ],
99
- "tokenizer_bos_token": [
100
- "<s>",
101
- "1"
102
- ],
103
- "eot_token_id": 2,
104
- "max_length": 4096,
105
- "task_hashes": {
106
- "acva": "d007c508f0accdd697f549d7cbe7f960f1470c8f86f1a0969355a6ef33108edb"
107
- },
108
- "model_source": "vllm",
109
- "model_name": "/ALLaM-7B-Instruct",
110
- "model_name_sanitized": "/ALLaM-7B-Instruct",
111
- "system_instruction": null,
112
- "system_instruction_sha": null,
113
- "fewshot_as_multiturn": false,
114
- "chat_template": null,
115
- "chat_template_sha": null,
116
- "start_time": 3374.021232778,
117
- "end_time": 3578.563943596,
118
- "total_evaluation_time_seconds": "204.54271081800016"
119
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/araMath_v2_5_shot.json DELETED
@@ -1,123 +0,0 @@
1
- {
2
- "results": {
3
- "araMath_v2": {
4
- "alias": "araMath_v2",
5
- "acc,none": 0.655,
6
- "acc_stderr,none": 0.019423021295885703,
7
- "acc_norm,none": 0.655,
8
- "acc_norm_stderr,none": 0.019423021295885703
9
- }
10
- },
11
- "group_subtasks": {
12
- "araMath_v2": []
13
- },
14
- "configs": {
15
- "araMath_v2": {
16
- "task": "araMath_v2",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/araMath_v2",
21
- "dataset_name": "araMath_v2",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "test_split": "test",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n def remove_prefix(choice):\n prefixes = [\"(A)\", \"(B)\", \"(C)\", \"(D)\"]\n for prefix in prefixes:\n if choice.startswith(prefix + \" \"):\n return choice[len(prefix) + 1:] \n return choice \n\n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n choices = \"\".join(\n [f\"{key}. {remove_prefix(choice)}\\n\" for key, choice in zip(keys, doc[\"options\"])]\n )\n\n prompt = f\"\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices}\\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n keys_en = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys_en),\n \"choices\": keys_en,\n \"gold\": keys_en.index(doc[\"label\"]),\n }\n return out_doc\n \n return dataset.map(_process_docs)\n",
27
- "doc_to_text": "query",
28
- "doc_to_target": "gold",
29
- "doc_to_choice": "{{choices}}",
30
- "description": "\u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u0645\u0646 \u0628\u064a\u0646 'A\u060c B\u060c C\u060c D' \u062f\u0648\u0646 \u0634\u0631\u062d",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 5,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": true,
49
- "doc_to_decontamination_query": "query",
50
- "metadata": {
51
- "version": 0.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "araMath_v2": 0.0
57
- },
58
- "n-shot": {
59
- "araMath_v2": 5
60
- },
61
- "higher_is_better": {
62
- "araMath_v2": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "araMath_v2": {
69
- "original": 600,
70
- "effective": 600
71
- }
72
- },
73
- "config": {
74
- "model": "hf",
75
- "model_args": "parallelize=True,pretrained=/ALLaM-7B-Instruct,trust_remote_code=True",
76
- "model_num_parameters": 7000559616,
77
- "model_dtype": "torch.bfloat16",
78
- "model_revision": "main",
79
- "model_sha": "",
80
- "batch_size": 1,
81
- "batch_sizes": [],
82
- "device": null,
83
- "use_cache": null,
84
- "limit": null,
85
- "bootstrap_iters": 100000,
86
- "gen_kwargs": null,
87
- "random_seed": 0,
88
- "numpy_seed": 1234,
89
- "torch_seed": 1234,
90
- "fewshot_seed": 1234
91
- },
92
- "git_hash": "5e10e017",
93
- "date": 1736774062.2964265,
94
- "pretty_env_info": "PyTorch version: 2.1.0a0+29c30b1\nIs debug build: False\nCUDA used to build PyTorch: 12.2\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.86\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.22.2\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.1.0a0+29c30b1\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.16.0a0\n[pip3] triton==2.0.0.dev20221202\n[conda] Could not collect",
95
- "transformers_version": "4.38.2",
96
- "upper_git_hash": null,
97
- "tokenizer_pad_token": [
98
- "<unk>",
99
- "0"
100
- ],
101
- "tokenizer_eos_token": [
102
- "</s>",
103
- "2"
104
- ],
105
- "tokenizer_bos_token": [
106
- "<s>",
107
- "1"
108
- ],
109
- "eot_token_id": 2,
110
- "max_length": 4096,
111
- "task_hashes": {},
112
- "model_source": "hf",
113
- "model_name": "/ALLaM-7B-Instruct",
114
- "model_name_sanitized": "/ALLaM-7B-Instruct",
115
- "system_instruction": null,
116
- "system_instruction_sha": null,
117
- "fewshot_as_multiturn": false,
118
- "chat_template": null,
119
- "chat_template_sha": null,
120
- "start_time": 72495.638596469,
121
- "end_time": 72556.179139124,
122
- "total_evaluation_time_seconds": "60.54054265499872"
123
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/arabicmmlu_0_shot.json DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/ar/etec_0_shot.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "results": {
3
- "etec": {
4
- "alias": "etec",
5
- "acc,none": 0.6680761099365751,
6
- "acc_stderr,none": 0.010828952839616325,
7
- "acc_norm,none": 0.6680761099365751,
8
- "acc_norm_stderr,none": 0.010828952839616325
9
- }
10
- },
11
- "group_subtasks": {
12
- "etec": []
13
- },
14
- "configs": {
15
- "etec": {
16
- "task": "etec",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/etec",
21
- "dataset_name": "etec",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "test_split": "test",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {choice}\\n\" for key, choice in zip(keys, doc[\"choices\"])]\n )\n prompt = f\"\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices}\\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n keys_ar = [\"\u0623\", \"\u0628\", \"\u062c\", \"\u062f\"]\n keys_en = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys_en),\n \"choices\": keys_en,\n \"gold\": keys_ar.index(doc[\"label\"]),\n }\n return out_doc\n \n return dataset.map(_process_docs)\n",
27
- "doc_to_text": "query",
28
- "doc_to_target": "gold",
29
- "doc_to_choice": "choices",
30
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0623\u0633\u0626\u0644\u0629 \u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631 \u0645\u0646 \u0645\u062a\u0639\u062f\u062f (\u0645\u0639 \u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a) \u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u062f\u0648\u0646 \u0634\u0631\u062d\n ",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 0,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": true,
49
- "doc_to_decontamination_query": "query",
50
- "metadata": {
51
- "version": 0.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "etec": 0.0
57
- },
58
- "n-shot": {
59
- "etec": 0
60
- },
61
- "higher_is_better": {
62
- "etec": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "etec": {
69
- "original": 1892,
70
- "effective": 1892
71
- }
72
- },
73
- "config": {
74
- "model": "vllm",
75
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
76
- "batch_size": 1,
77
- "batch_sizes": [],
78
- "device": null,
79
- "use_cache": null,
80
- "limit": null,
81
- "bootstrap_iters": 100000,
82
- "gen_kwargs": null,
83
- "random_seed": 0,
84
- "numpy_seed": 1234,
85
- "torch_seed": 1234,
86
- "fewshot_seed": 1234
87
- },
88
- "git_hash": "8e1bd48d",
89
- "date": 1735662950.8344455,
90
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
91
- "transformers_version": "4.47.1",
92
- "upper_git_hash": null,
93
- "tokenizer_pad_token": [
94
- "<unk>",
95
- "0"
96
- ],
97
- "tokenizer_eos_token": [
98
- "</s>",
99
- "2"
100
- ],
101
- "tokenizer_bos_token": [
102
- "<s>",
103
- "1"
104
- ],
105
- "eot_token_id": 2,
106
- "max_length": 4096,
107
- "task_hashes": {
108
- "etec": "8937d87b09ed63604ed9f64a02b8ba75ee9c43b9acebd5dd58a797e187916bbf"
109
- },
110
- "model_source": "vllm",
111
- "model_name": "/ALLaM-7B-Instruct",
112
- "model_name_sanitized": "/ALLaM-7B-Instruct",
113
- "system_instruction": null,
114
- "system_instruction_sha": null,
115
- "fewshot_as_multiturn": false,
116
- "chat_template": null,
117
- "chat_template_sha": null,
118
- "start_time": 3611.154007204,
119
- "end_time": 3697.095375819,
120
- "total_evaluation_time_seconds": "85.94136861499965"
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/exams_ar_5_shot.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "results": {
3
- "exams_ar": {
4
- "alias": "exams_ar",
5
- "acc,none": 0.515828677839851,
6
- "acc_stderr,none": 0.021585885942816244,
7
- "acc_norm,none": 0.515828677839851,
8
- "acc_norm_stderr,none": 0.021585885942816244
9
- }
10
- },
11
- "group_subtasks": {
12
- "exams_ar": []
13
- },
14
- "configs": {
15
- "exams_ar": {
16
- "task": "exams_ar",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/exams_ar",
21
- "dataset_name": "exams_ar",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "test_split": "test",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n\n def _process_docs(doc):\n def format_example(doc, keys):\n \"\"\"\n <prompt>\n \u0633\u0624\u0627\u0644:\n A. <choice1>\n B. <choice2>\n C. <choice3>\n D. <choice4>\n \u0627\u062c\u0627\u0628\u0629:\n \"\"\"\n \n question = doc[\"question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {choice}\\n\" for key, choice in zip(keys, doc[\"choices\"])]\n )\n prompt = f\"\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n def _format_subject(subject):\n arabic_words = subtasks_ar[subtasks.index(subject)]\n return arabic_words\n\n keys = [\"A\", \"B\", \"C\", \"D\"]\n \n subject = doc['id'].split(\"-\")[0]\n description = f\"\ufed2\ufef4\ufee3\ufe8d \ufef2\ufee0\ufef3 \ufe84\ufeb4\ufe8c\ufedf\ufe93 \ufe8d\ufefc\ufea8\ufe98\ufef3\ufe8d\ufead \ufee2\ufee7 \ufee2\ufe98\ufecb\ufea9\ufea9 (\ufee2\ufecb \ufe8d\ufefa\ufe9f\ufe8e\ufe91\ufe8e\ufe97) \ufea1\ufeee\ufedf {_format_subject(subject)} \\n\" #\ufee2\ufee7 \ufed2\ufec0\ufee0\ufedb \ufe8e\ufea8\ufe97\ufead \ufe88\ufe9f\ufe8e\ufe91\ufe93 \ufeed\ufe8e\ufea3\ufea9\ufe93 \ufee2\ufee7 \ufe90\ufef4\ufee7 'A\u060c B\u060c C\u060c D' \ufea9\ufeee\ufee7 \ufeb5\ufeae\ufea3\\n\"\n\n out_doc = {\n \"idx\": doc[\"idx\"],\n \"id\": doc[\"id\"],\n 'dsecription': description,\n \"query\": format_example(doc, keys), # \"Question: \" + doc[\"question\"]['stem'] + \"\\nAnswer:\",\n \"choices\": keys,\n \"gold\": [\"A\", \"B\", \"C\", \"D\"].index(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_docs)\n",
27
- "doc_to_text": "query",
28
- "doc_to_target": "gold",
29
- "doc_to_choice": "choices",
30
- "description": "description",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 5,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": true,
49
- "doc_to_decontamination_query": "query",
50
- "metadata": {
51
- "version": 0.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "exams_ar": 0.0
57
- },
58
- "n-shot": {
59
- "exams_ar": 5
60
- },
61
- "higher_is_better": {
62
- "exams_ar": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "exams_ar": {
69
- "original": 537,
70
- "effective": 537
71
- }
72
- },
73
- "config": {
74
- "model": "vllm",
75
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
76
- "batch_size": 1,
77
- "batch_sizes": [],
78
- "device": null,
79
- "use_cache": null,
80
- "limit": null,
81
- "bootstrap_iters": 100000,
82
- "gen_kwargs": null,
83
- "random_seed": 0,
84
- "numpy_seed": 1234,
85
- "torch_seed": 1234,
86
- "fewshot_seed": 1234
87
- },
88
- "git_hash": "8e1bd48d",
89
- "date": 1735662207.0830526,
90
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
91
- "transformers_version": "4.47.1",
92
- "upper_git_hash": null,
93
- "tokenizer_pad_token": [
94
- "<unk>",
95
- "0"
96
- ],
97
- "tokenizer_eos_token": [
98
- "</s>",
99
- "2"
100
- ],
101
- "tokenizer_bos_token": [
102
- "<s>",
103
- "1"
104
- ],
105
- "eot_token_id": 2,
106
- "max_length": 4096,
107
- "task_hashes": {
108
- "exams_ar": "b1561abd56354d570ac16bf64163b0ee8dc6c507234b05f678576b09c26c644a"
109
- },
110
- "model_source": "vllm",
111
- "model_name": "/ALLaM-7B-Instruct",
112
- "model_name_sanitized": "/ALLaM-7B-Instruct",
113
- "system_instruction": null,
114
- "system_instruction_sha": null,
115
- "fewshot_as_multiturn": false,
116
- "chat_template": null,
117
- "chat_template_sha": null,
118
- "start_time": 2867.397536365,
119
- "end_time": 2948.510496752,
120
- "total_evaluation_time_seconds": "81.11296038699993"
121
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/gat_0_shot.json DELETED
@@ -1,549 +0,0 @@
1
- {
2
- "results": {
3
- "gat": {
4
- "acc,none": 0.4452527279568544,
5
- "acc_stderr,none": 0.0038711388833064567,
6
- "alias": "gat"
7
- },
8
- "gat_algebra": {
9
- "alias": " - gat_algebra",
10
- "acc,none": 0.40667903525046384,
11
- "acc_stderr,none": 0.009463939247454995
12
- },
13
- "gat_analogy": {
14
- "alias": " - gat_analogy",
15
- "acc,none": 0.35919854280510016,
16
- "acc_stderr,none": 0.009158766245747282
17
- },
18
- "gat_arithmetic": {
19
- "alias": " - gat_arithmetic",
20
- "acc,none": 0.40154582259845417,
21
- "acc_stderr,none": 0.009406284814832203
22
- },
23
- "gat_association": {
24
- "alias": " - gat_association",
25
- "acc,none": 0.5464114832535886,
26
- "acc_stderr,none": 0.015407801869520031
27
- },
28
- "gat_comparisons": {
29
- "alias": " - gat_comparisons",
30
- "acc,none": 0.34508196721311474,
31
- "acc_stderr,none": 0.013616100682624904
32
- },
33
- "gat_completion": {
34
- "alias": " - gat_completion",
35
- "acc,none": 0.6057851239669422,
36
- "acc_stderr,none": 0.014054411207805699
37
- },
38
- "gat_contextual": {
39
- "alias": " - gat_contextual",
40
- "acc,none": 0.3941717791411043,
41
- "acc_stderr,none": 0.013537713096332765
42
- },
43
- "gat_geometry": {
44
- "alias": " - gat_geometry",
45
- "acc,none": 0.473972602739726,
46
- "acc_stderr,none": 0.026171590093068537
47
- },
48
- "gat_reading": {
49
- "alias": " - gat_reading",
50
- "acc,none": 0.5727788279773157,
51
- "acc_stderr,none": 0.009620311542503682
52
- }
53
- },
54
- "groups": {
55
- "gat": {
56
- "acc,none": 0.4452527279568544,
57
- "acc_stderr,none": 0.0038711388833064567,
58
- "alias": "gat"
59
- }
60
- },
61
- "group_subtasks": {
62
- "gat": [
63
- "gat_analogy",
64
- "gat_association",
65
- "gat_completion",
66
- "gat_reading",
67
- "gat_algebra",
68
- "gat_arithmetic",
69
- "gat_comparisons",
70
- "gat_contextual",
71
- "gat_geometry"
72
- ]
73
- },
74
- "configs": {
75
- "gat_algebra": {
76
- "task": "gat_algebra",
77
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
78
- "dataset_name": "algebra",
79
- "dataset_kwargs": {
80
- "trust_remote_code": true
81
- },
82
- "test_split": "test",
83
- "fewshot_split": "validation",
84
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
85
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
86
- "doc_to_target": "{{label}}",
87
- "doc_to_choice": [
88
- "\u0623",
89
- "\u0628",
90
- "\u062c",
91
- "\u062f"
92
- ],
93
- "description": "",
94
- "target_delimiter": " ",
95
- "fewshot_delimiter": "\n\n",
96
- "num_fewshot": 0,
97
- "metric_list": [
98
- {
99
- "metric": "acc",
100
- "aggregation": "mean",
101
- "higher_is_better": true
102
- }
103
- ],
104
- "output_type": "multiple_choice",
105
- "repeats": 1,
106
- "should_decontaminate": false,
107
- "metadata": {
108
- "version": 0.0
109
- }
110
- },
111
- "gat_analogy": {
112
- "task": "gat_analogy",
113
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
114
- "dataset_name": "analogy",
115
- "dataset_kwargs": {
116
- "trust_remote_code": true
117
- },
118
- "test_split": "test",
119
- "fewshot_split": "validation",
120
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
121
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
122
- "doc_to_target": "{{label}}",
123
- "doc_to_choice": [
124
- "\u0623",
125
- "\u0628",
126
- "\u062c",
127
- "\u062f"
128
- ],
129
- "description": "",
130
- "target_delimiter": " ",
131
- "fewshot_delimiter": "\n\n",
132
- "num_fewshot": 0,
133
- "metric_list": [
134
- {
135
- "metric": "acc",
136
- "aggregation": "mean",
137
- "higher_is_better": true
138
- }
139
- ],
140
- "output_type": "multiple_choice",
141
- "repeats": 1,
142
- "should_decontaminate": false,
143
- "metadata": {
144
- "version": 0.0
145
- }
146
- },
147
- "gat_arithmetic": {
148
- "task": "gat_arithmetic",
149
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
150
- "dataset_name": "arithmetic",
151
- "dataset_kwargs": {
152
- "trust_remote_code": true
153
- },
154
- "test_split": "test",
155
- "fewshot_split": "validation",
156
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
157
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
158
- "doc_to_target": "{{label}}",
159
- "doc_to_choice": [
160
- "\u0623",
161
- "\u0628",
162
- "\u062c",
163
- "\u062f"
164
- ],
165
- "description": "",
166
- "target_delimiter": " ",
167
- "fewshot_delimiter": "\n\n",
168
- "num_fewshot": 0,
169
- "metric_list": [
170
- {
171
- "metric": "acc",
172
- "aggregation": "mean",
173
- "higher_is_better": true
174
- }
175
- ],
176
- "output_type": "multiple_choice",
177
- "repeats": 1,
178
- "should_decontaminate": false,
179
- "metadata": {
180
- "version": 0.0
181
- }
182
- },
183
- "gat_association": {
184
- "task": "gat_association",
185
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
186
- "dataset_name": "association",
187
- "dataset_kwargs": {
188
- "trust_remote_code": true
189
- },
190
- "test_split": "test",
191
- "fewshot_split": "validation",
192
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
193
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
194
- "doc_to_target": "{{label}}",
195
- "doc_to_choice": [
196
- "\u0623",
197
- "\u0628",
198
- "\u062c",
199
- "\u062f"
200
- ],
201
- "description": "",
202
- "target_delimiter": " ",
203
- "fewshot_delimiter": "\n\n",
204
- "num_fewshot": 0,
205
- "metric_list": [
206
- {
207
- "metric": "acc",
208
- "aggregation": "mean",
209
- "higher_is_better": true
210
- }
211
- ],
212
- "output_type": "multiple_choice",
213
- "repeats": 1,
214
- "should_decontaminate": false,
215
- "metadata": {
216
- "version": 0.0
217
- }
218
- },
219
- "gat_comparisons": {
220
- "task": "gat_comparisons",
221
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
222
- "dataset_name": "comparisons",
223
- "dataset_kwargs": {
224
- "trust_remote_code": true
225
- },
226
- "test_split": "test",
227
- "fewshot_split": "validation",
228
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
229
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
230
- "doc_to_target": "{{label}}",
231
- "doc_to_choice": [
232
- "\u0623",
233
- "\u0628",
234
- "\u062c",
235
- "\u062f"
236
- ],
237
- "description": "",
238
- "target_delimiter": " ",
239
- "fewshot_delimiter": "\n\n",
240
- "num_fewshot": 0,
241
- "metric_list": [
242
- {
243
- "metric": "acc",
244
- "aggregation": "mean",
245
- "higher_is_better": true
246
- }
247
- ],
248
- "output_type": "multiple_choice",
249
- "repeats": 1,
250
- "should_decontaminate": false,
251
- "metadata": {
252
- "version": 0.0
253
- }
254
- },
255
- "gat_completion": {
256
- "task": "gat_completion",
257
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
258
- "dataset_name": "completion",
259
- "dataset_kwargs": {
260
- "trust_remote_code": true
261
- },
262
- "test_split": "test",
263
- "fewshot_split": "validation",
264
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
265
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
266
- "doc_to_target": "{{label}}",
267
- "doc_to_choice": [
268
- "\u0623",
269
- "\u0628",
270
- "\u062c",
271
- "\u062f"
272
- ],
273
- "description": "",
274
- "target_delimiter": " ",
275
- "fewshot_delimiter": "\n\n",
276
- "num_fewshot": 0,
277
- "metric_list": [
278
- {
279
- "metric": "acc",
280
- "aggregation": "mean",
281
- "higher_is_better": true
282
- }
283
- ],
284
- "output_type": "multiple_choice",
285
- "repeats": 1,
286
- "should_decontaminate": false,
287
- "metadata": {
288
- "version": 0.0
289
- }
290
- },
291
- "gat_contextual": {
292
- "task": "gat_contextual",
293
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
294
- "dataset_name": "contextual",
295
- "dataset_kwargs": {
296
- "trust_remote_code": true
297
- },
298
- "test_split": "test",
299
- "fewshot_split": "validation",
300
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
301
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
302
- "doc_to_target": "{{label}}",
303
- "doc_to_choice": [
304
- "\u0623",
305
- "\u0628",
306
- "\u062c",
307
- "\u062f"
308
- ],
309
- "description": "\u0627\u0648\u062c\u062f \u0627\u0644\u062e\u0637\u0623 \u0627\u0644\u0633\u064a\u0627\u0642\u064a \u0641\u064a \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0627\u0644\u062a\u0627\u0644\u064a\u0629 \u0645\u0646 \u0628\u064a\u0646 \u0627\u0644\u062e\u064a\u0627\u0631\u0627\u062a:",
310
- "target_delimiter": " ",
311
- "fewshot_delimiter": "\n\n",
312
- "num_fewshot": 0,
313
- "metric_list": [
314
- {
315
- "metric": "acc",
316
- "aggregation": "mean",
317
- "higher_is_better": true
318
- }
319
- ],
320
- "output_type": "multiple_choice",
321
- "repeats": 1,
322
- "should_decontaminate": false,
323
- "metadata": {
324
- "version": 0.0
325
- }
326
- },
327
- "gat_geometry": {
328
- "task": "gat_geometry",
329
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
330
- "dataset_name": "geometry",
331
- "dataset_kwargs": {
332
- "trust_remote_code": true
333
- },
334
- "test_split": "test",
335
- "fewshot_split": "validation",
336
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
337
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
338
- "doc_to_target": "{{label}}",
339
- "doc_to_choice": [
340
- "\u0623",
341
- "\u0628",
342
- "\u062c",
343
- "\u062f"
344
- ],
345
- "description": "",
346
- "target_delimiter": " ",
347
- "fewshot_delimiter": "\n\n",
348
- "num_fewshot": 0,
349
- "metric_list": [
350
- {
351
- "metric": "acc",
352
- "aggregation": "mean",
353
- "higher_is_better": true
354
- }
355
- ],
356
- "output_type": "multiple_choice",
357
- "repeats": 1,
358
- "should_decontaminate": false,
359
- "metadata": {
360
- "version": 0.0
361
- }
362
- },
363
- "gat_reading": {
364
- "task": "gat_reading",
365
- "dataset_path": "lm_eval/tasks/gat/gat_data/gat.py",
366
- "dataset_name": "reading",
367
- "dataset_kwargs": {
368
- "trust_remote_code": true
369
- },
370
- "test_split": "test",
371
- "fewshot_split": "validation",
372
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n # def _process_doc(doc):\n \n # subject = doc['id'].split(\"-\")[0]\n # subject_ar = subtasks_ar[subtasks.index(subject)]\n # out_doc = {**doc, 'subject_ar': subject_ar}\n # print(subject_ar)\n # print(out_doc)\n # return out_doc\n\n return dataset\n",
373
- "doc_to_text": "{{question}}\n\u0623. {{choices[0]}}\n\u0628. {{choices[1]}}\n\u062c. {{choices[2]}}\n\u062f. {{choices[3]}}\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:",
374
- "doc_to_target": "{{label}}",
375
- "doc_to_choice": [
376
- "\u0623",
377
- "\u0628",
378
- "\u062c",
379
- "\u062f"
380
- ],
381
- "description": "",
382
- "target_delimiter": " ",
383
- "fewshot_delimiter": "\n\n",
384
- "num_fewshot": 0,
385
- "metric_list": [
386
- {
387
- "metric": "acc",
388
- "aggregation": "mean",
389
- "higher_is_better": true
390
- }
391
- ],
392
- "output_type": "multiple_choice",
393
- "repeats": 1,
394
- "should_decontaminate": false,
395
- "metadata": {
396
- "version": 0.0
397
- }
398
- }
399
- },
400
- "versions": {
401
- "gat": 0,
402
- "gat_algebra": 0.0,
403
- "gat_analogy": 0.0,
404
- "gat_arithmetic": 0.0,
405
- "gat_association": 0.0,
406
- "gat_comparisons": 0.0,
407
- "gat_completion": 0.0,
408
- "gat_contextual": 0.0,
409
- "gat_geometry": 0.0,
410
- "gat_reading": 0.0
411
- },
412
- "n-shot": {
413
- "gat_algebra": 0,
414
- "gat_analogy": 0,
415
- "gat_arithmetic": 0,
416
- "gat_association": 0,
417
- "gat_comparisons": 0,
418
- "gat_completion": 0,
419
- "gat_contextual": 0,
420
- "gat_geometry": 0,
421
- "gat_reading": 0
422
- },
423
- "higher_is_better": {
424
- "gat": {
425
- "acc": true
426
- },
427
- "gat_algebra": {
428
- "acc": true
429
- },
430
- "gat_analogy": {
431
- "acc": true
432
- },
433
- "gat_arithmetic": {
434
- "acc": true
435
- },
436
- "gat_association": {
437
- "acc": true
438
- },
439
- "gat_comparisons": {
440
- "acc": true
441
- },
442
- "gat_completion": {
443
- "acc": true
444
- },
445
- "gat_contextual": {
446
- "acc": true
447
- },
448
- "gat_geometry": {
449
- "acc": true
450
- },
451
- "gat_reading": {
452
- "acc": true
453
- }
454
- },
455
- "n-samples": {
456
- "gat_analogy": {
457
- "original": 2745,
458
- "effective": 2745
459
- },
460
- "gat_association": {
461
- "original": 1045,
462
- "effective": 1045
463
- },
464
- "gat_completion": {
465
- "original": 1210,
466
- "effective": 1210
467
- },
468
- "gat_reading": {
469
- "original": 2645,
470
- "effective": 2645
471
- },
472
- "gat_algebra": {
473
- "original": 2695,
474
- "effective": 2695
475
- },
476
- "gat_arithmetic": {
477
- "original": 2717,
478
- "effective": 2717
479
- },
480
- "gat_comparisons": {
481
- "original": 1220,
482
- "effective": 1220
483
- },
484
- "gat_contextual": {
485
- "original": 1304,
486
- "effective": 1304
487
- },
488
- "gat_geometry": {
489
- "original": 365,
490
- "effective": 365
491
- }
492
- },
493
- "config": {
494
- "model": "vllm",
495
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
496
- "batch_size": 1,
497
- "batch_sizes": [],
498
- "device": null,
499
- "use_cache": null,
500
- "limit": null,
501
- "bootstrap_iters": 100000,
502
- "gen_kwargs": null,
503
- "random_seed": 0,
504
- "numpy_seed": 1234,
505
- "torch_seed": 1234,
506
- "fewshot_seed": 1234
507
- },
508
- "git_hash": "8e1bd48d",
509
- "date": 1735664096.2650902,
510
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
511
- "transformers_version": "4.47.1",
512
- "upper_git_hash": null,
513
- "tokenizer_pad_token": [
514
- "<unk>",
515
- "0"
516
- ],
517
- "tokenizer_eos_token": [
518
- "</s>",
519
- "2"
520
- ],
521
- "tokenizer_bos_token": [
522
- "<s>",
523
- "1"
524
- ],
525
- "eot_token_id": 2,
526
- "max_length": 4096,
527
- "task_hashes": {
528
- "gat_analogy": "ede28dec097bfebe8a85a19fa27d001696858276df66254bdb70fc63231f1a83",
529
- "gat_association": "5d82550d46c4f3cabf370185a8a23cc2eb5b08f1f0c5e210a8a712562a44bd08",
530
- "gat_completion": "fc3c19dd7f1896696fec1bffc21182804c9b2f1fb8d8c882428a6bb4bb61e370",
531
- "gat_reading": "93053b187a750d2e87f5488f2d0fda944f3da9195bb04d1c4dee9c4b56fa626a",
532
- "gat_algebra": "77832c595eaaf156775c3dbb27da0915ef600ebf46a7113ae32a202b0359e8a6",
533
- "gat_arithmetic": "6a498f75f5cc0ffd1b30f7a6293ba80d08f2a8876d5558d8e934bf57355ff0cc",
534
- "gat_comparisons": "acb80c0ed8dd07e916a471189aef3a546efc289824b2cc50a32c11dc4c97c9c1",
535
- "gat_contextual": "de063ed3b94011d74ee24a6532122c9d344fc15e42800db44f0849995a0bc37a",
536
- "gat_geometry": "3e482885559a4404ee9e97556edc6e49959770a499f4ae2c58f18ad85b91a363"
537
- },
538
- "model_source": "vllm",
539
- "model_name": "/ALLaM-7B-Instruct",
540
- "model_name_sanitized": "/ALLaM-7B-Instruct",
541
- "system_instruction": null,
542
- "system_instruction_sha": null,
543
- "fewshot_as_multiturn": false,
544
- "chat_template": null,
545
- "chat_template_sha": null,
546
- "start_time": 4756.376698655,
547
- "end_time": 5124.76942052,
548
- "total_evaluation_time_seconds": "368.39272186499966"
549
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/moe_ien_mcq_0_shot.json DELETED
@@ -1,118 +0,0 @@
1
- {
2
- "results": {
3
- "moe_ien_mcq": {
4
- "alias": "moe_ien_mcq",
5
- "acc,none": 0.9154154154154154,
6
- "acc_stderr,none": 0.0027841569543517694,
7
- "acc_norm,none": 0.9154154154154154,
8
- "acc_norm_stderr,none": 0.0027841569543517694
9
- }
10
- },
11
- "group_subtasks": {
12
- "moe_ien_mcq": []
13
- },
14
- "configs": {
15
- "moe_ien_mcq": {
16
- "task": "moe_ien_mcq",
17
- "dataset_path": "lm_eval/tasks/moe_ien_mcq/ien_moe_mcq.py",
18
- "dataset_name": "moe_ien_mcq",
19
- "dataset_kwargs": {
20
- "trust_remote_code": true
21
- },
22
- "test_split": "test",
23
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc): \n def remove_prefix(choice):\n return choice.split(\". \", 1)[1] if \". \" in choice else choice\n\n def format_example(doc, keys):\n question = doc[\"Question\"].strip()\n \n choices = \"\".join(\n [f\"{key}. {remove_prefix(choice)}\\n\" for key, choice in zip(keys, doc[\"Choices\"])]\n \n )\n prompt = f\"\\n\\n\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n keys = [\"A\", \"B\", \"C\", \"D\", \"E\", \"F\"][0:len(doc[\"Choices\"])]\n out_doc = {\n \"Query\": format_example(doc, keys), \n \"Choices\": keys,\n \"gold\": int(doc[\"Answer\"])-1, ## \n \"Speciality\": doc['Speciality']\n } \n return out_doc\n \n return dataset.map(_process_docs)\n",
24
- "doc_to_text": "Query",
25
- "doc_to_target": "gold",
26
- "doc_to_choice": "{{Choices}}",
27
- "description": "\u0641\u064a\u0645\u0627\u202f\u064a\u0644\u064a\u202f\u0623\u0633\u0626\u0644\u0629\u202f\u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631\u202f\u0645\u0646\u202f\u0645\u062a\u0639\u062f\u062f\u202f(\u0645\u0639\u202f\u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a)\u202f\u0641\u064a\u202f{{Speciality}}",
28
- "target_delimiter": " ",
29
- "fewshot_delimiter": "\n\n",
30
- "num_fewshot": 0,
31
- "metric_list": [
32
- {
33
- "metric": "acc",
34
- "aggregation": "mean",
35
- "higher_is_better": true
36
- },
37
- {
38
- "metric": "acc_norm",
39
- "aggregation": "mean",
40
- "higher_is_better": true
41
- }
42
- ],
43
- "output_type": "multiple_choice",
44
- "repeats": 1,
45
- "should_decontaminate": true,
46
- "doc_to_decontamination_query": "Query",
47
- "metadata": {
48
- "version": 0.0
49
- }
50
- }
51
- },
52
- "versions": {
53
- "moe_ien_mcq": 0.0
54
- },
55
- "n-shot": {
56
- "moe_ien_mcq": 0
57
- },
58
- "higher_is_better": {
59
- "moe_ien_mcq": {
60
- "acc": true,
61
- "acc_norm": true
62
- }
63
- },
64
- "n-samples": {
65
- "moe_ien_mcq": {
66
- "original": 9990,
67
- "effective": 9990
68
- }
69
- },
70
- "config": {
71
- "model": "vllm",
72
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
73
- "batch_size": 1,
74
- "batch_sizes": [],
75
- "device": null,
76
- "use_cache": null,
77
- "limit": null,
78
- "bootstrap_iters": 100000,
79
- "gen_kwargs": null,
80
- "random_seed": 0,
81
- "numpy_seed": 1234,
82
- "torch_seed": 1234,
83
- "fewshot_seed": 1234
84
- },
85
- "git_hash": "8e1bd48d",
86
- "date": 1735663068.5370116,
87
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
88
- "transformers_version": "4.47.1",
89
- "upper_git_hash": null,
90
- "tokenizer_pad_token": [
91
- "<unk>",
92
- "0"
93
- ],
94
- "tokenizer_eos_token": [
95
- "</s>",
96
- "2"
97
- ],
98
- "tokenizer_bos_token": [
99
- "<s>",
100
- "1"
101
- ],
102
- "eot_token_id": 2,
103
- "max_length": 4096,
104
- "task_hashes": {
105
- "moe_ien_mcq": "554899322e5b78369683b10024d90dc868f768d310530589a6167541e8f9d594"
106
- },
107
- "model_source": "vllm",
108
- "model_name": "/ALLaM-7B-Instruct",
109
- "model_name_sanitized": "/ALLaM-7B-Instruct",
110
- "system_instruction": null,
111
- "system_instruction_sha": null,
112
- "fewshot_as_multiturn": false,
113
- "chat_template": null,
114
- "chat_template_sha": null,
115
- "start_time": 3728.910211786,
116
- "end_time": 3947.718352837,
117
- "total_evaluation_time_seconds": "218.8081410509999"
118
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/moe_ien_tf_0_shot.json DELETED
@@ -1,119 +0,0 @@
1
- {
2
- "results": {
3
- "moe_ien_tf": {
4
- "alias": "moe_ien_tf",
5
- "acc,none": 0.8557082967729356,
6
- "acc_stderr,none": 0.0034697209254064324,
7
- "acc_norm,none": 0.8557082967729356,
8
- "acc_norm_stderr,none": 0.0034697209254064324
9
- }
10
- },
11
- "group_subtasks": {
12
- "moe_ien_tf": []
13
- },
14
- "configs": {
15
- "moe_ien_tf": {
16
- "task": "moe_ien_tf",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/moe_ien_tf",
21
- "dataset_kwargs": {
22
- "trust_remote_code": true
23
- },
24
- "test_split": "test",
25
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc):\n keys=[\"\u0635\u062d\u064a\u062d\u0629\",\n \"\u062e\u0627\u0637\u0626\u0629\"\n ]\n #keys =[\"\u0635\u0648\u0627\u0628\",\n # \"\u062e\u0637\u0623\"]\n target_key = int(doc[\"Answer\"])-1\n\n out_doc = {\n \"query\": \"\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644:\" +doc[\"Question\"]+\"\\n\u0625\u062c\u0627\u0628\u0629:'\", \n \"choices\": keys,\n \"gold\": target_key,\n }\n return out_doc\n return dataset.map(_process_docs)\n",
26
- "doc_to_text": "query",
27
- "doc_to_target": "gold",
28
- "doc_to_choice": "choices",
29
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0639\u0628\u0627\u0631\u0627\u062a \u0625\u0645\u0627 \u0635\u062d\u064a\u062d\u0629 \u0623\u0648 \u062e\u0627\u0637\u0626\u0629 \u062d\u0648\u0644 {{Speciality}}\n \u0627\u0644\u0631\u062c\u0627\u0621 \u062a\u0635\u0646\u064a\u0641 \u0627\u0644\u0639\u0628\u0627\u0631\u0629 \u0625\u0644\u0649 '\u0635\u062d\u064a\u062d\u0629' \u0623\u0648 '\u062e\u0627\u0637\u0626\u0629' \u062f\u0648\u0646 \u0634\u0631\u062d ",
30
- "target_delimiter": " ",
31
- "fewshot_delimiter": "\n\n",
32
- "num_fewshot": 0,
33
- "metric_list": [
34
- {
35
- "metric": "acc",
36
- "aggregation": "mean",
37
- "higher_is_better": true
38
- },
39
- {
40
- "metric": "acc_norm",
41
- "aggregation": "mean",
42
- "higher_is_better": true
43
- }
44
- ],
45
- "output_type": "multiple_choice",
46
- "repeats": 1,
47
- "should_decontaminate": false,
48
- "metadata": {
49
- "version": 0.0
50
- }
51
- }
52
- },
53
- "versions": {
54
- "moe_ien_tf": 0.0
55
- },
56
- "n-shot": {
57
- "moe_ien_tf": 0
58
- },
59
- "higher_is_better": {
60
- "moe_ien_tf": {
61
- "acc": true,
62
- "acc_norm": true
63
- }
64
- },
65
- "n-samples": {
66
- "moe_ien_tf": {
67
- "original": 10257,
68
- "effective": 10257
69
- }
70
- },
71
- "config": {
72
- "model": "vllm",
73
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
74
- "batch_size": 1,
75
- "batch_sizes": [],
76
- "device": null,
77
- "use_cache": null,
78
- "limit": null,
79
- "bootstrap_iters": 100000,
80
- "gen_kwargs": null,
81
- "random_seed": 0,
82
- "numpy_seed": 1234,
83
- "torch_seed": 1234,
84
- "fewshot_seed": 1234
85
- },
86
- "git_hash": "8e1bd48d",
87
- "date": 1735663321.6141305,
88
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
89
- "transformers_version": "4.47.1",
90
- "upper_git_hash": null,
91
- "tokenizer_pad_token": [
92
- "<unk>",
93
- "0"
94
- ],
95
- "tokenizer_eos_token": [
96
- "</s>",
97
- "2"
98
- ],
99
- "tokenizer_bos_token": [
100
- "<s>",
101
- "1"
102
- ],
103
- "eot_token_id": 2,
104
- "max_length": 4096,
105
- "task_hashes": {
106
- "moe_ien_tf": "bf29d6fb290755a9dc7c5aaf1263e4cd1e9d82a62085aa6279661d8b84fd5ab6"
107
- },
108
- "model_source": "vllm",
109
- "model_name": "/ALLaM-7B-Instruct",
110
- "model_name_sanitized": "/ALLaM-7B-Instruct",
111
- "system_instruction": null,
112
- "system_instruction_sha": null,
113
- "fewshot_as_multiturn": false,
114
- "chat_template": null,
115
- "chat_template_sha": null,
116
- "start_time": 3981.83990155,
117
- "end_time": 4097.740745391,
118
- "total_evaluation_time_seconds": "115.9008438410001"
119
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evaluation/ar/openaimmlu_0_shot.json DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/ar/sdaia_mcq_0_shot.json DELETED
@@ -1,121 +0,0 @@
1
- {
2
- "results": {
3
- "sdaia_mcq": {
4
- "alias": "sdaia_mcq",
5
- "acc,none": 0.739021329987453,
6
- "acc_stderr,none": 0.011003303841318535,
7
- "acc_norm,none": 0.739021329987453,
8
- "acc_norm_stderr,none": 0.011003303841318535
9
- }
10
- },
11
- "group_subtasks": {
12
- "sdaia_mcq": []
13
- },
14
- "configs": {
15
- "sdaia_mcq": {
16
- "task": "sdaia_mcq",
17
- "tag": [
18
- "multiple_choice"
19
- ],
20
- "dataset_path": "lm_eval/tasks/sdaia_mcq/sdaia_mcq.py",
21
- "dataset_name": "sdaia_mcq",
22
- "dataset_kwargs": {
23
- "trust_remote_code": true
24
- },
25
- "test_split": "test",
26
- "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_docs(doc): \n def remove_prefix(choice):\n return choice.replace('.', '') if '.' in choice[:2] else choice\n \n def format_example(doc, keys):\n question = doc[\"question\"].strip()\n \n choice_num = ['choice1', 'choice2', 'choice3', 'choice4']\n choices = \"\".join(\n [f\"{key}. {remove_prefix(doc[choice_num[index]])}\\n\" for index, key in enumerate(keys)]\n )\n\n prompt = f\"\\n\\n\\n\u0627\u0644\u0633\u0624\u0627\u0644: {question}\\n{choices} \\n\u0627\u0644\u0627\u062c\u0627\u0628\u0629:\"\n return prompt\n\n #keys = [\"1\", \"2\", \"3\", \"4\"]\n keys = [\"A\", \"B\", \"C\", \"D\"]\n out_doc = {\n \"query\": format_example(doc, keys), \n \"choices\": keys,\n \"gold\": doc[\"answer\"]-1,\n } \n\n return out_doc\n \n return dataset.map(_process_docs)\n",
27
- "doc_to_text": "query",
28
- "doc_to_target": "gold",
29
- "doc_to_choice": "{{choices}}",
30
- "description": "\u0641\u064a\u0645\u0627 \u064a\u0644\u064a \u0623\u0633\u0626\u0644\u0629 \u0627\u0644\u0627\u062e\u062a\u064a\u0627\u0631 \u0645\u0646 \u0645\u062a\u0639\u062f\u062f (\u0645\u0639 \u0627\u0644\u0625\u062c\u0627\u0628\u0627\u062a) \u0645\u0646 \u0641\u0636\u0644\u0643 \u0627\u062e\u062a\u0631 \u0625\u062c\u0627\u0628\u0629 \u0648\u0627\u062d\u062f\u0629 \u062f\u0648\u0646 \u0634\u0631\u062d",
31
- "target_delimiter": " ",
32
- "fewshot_delimiter": "\n\n",
33
- "num_fewshot": 0,
34
- "metric_list": [
35
- {
36
- "metric": "acc",
37
- "aggregation": "mean",
38
- "higher_is_better": true
39
- },
40
- {
41
- "metric": "acc_norm",
42
- "aggregation": "mean",
43
- "higher_is_better": true
44
- }
45
- ],
46
- "output_type": "multiple_choice",
47
- "repeats": 1,
48
- "should_decontaminate": true,
49
- "doc_to_decontamination_query": "Question",
50
- "metadata": {
51
- "version": 0.0
52
- }
53
- }
54
- },
55
- "versions": {
56
- "sdaia_mcq": 0.0
57
- },
58
- "n-shot": {
59
- "sdaia_mcq": 0
60
- },
61
- "higher_is_better": {
62
- "sdaia_mcq": {
63
- "acc": true,
64
- "acc_norm": true
65
- }
66
- },
67
- "n-samples": {
68
- "sdaia_mcq": {
69
- "original": 1594,
70
- "effective": 1594
71
- }
72
- },
73
- "config": {
74
- "model": "vllm",
75
- "model_args": "pretrained=/ALLaM-7B-Instruct,tensor_parallel_size=1,data_parallel_size=2,gpu_memory_utilization=0.8",
76
- "batch_size": 1,
77
- "batch_sizes": [],
78
- "device": null,
79
- "use_cache": null,
80
- "limit": null,
81
- "bootstrap_iters": 100000,
82
- "gen_kwargs": null,
83
- "random_seed": 0,
84
- "numpy_seed": 1234,
85
- "torch_seed": 1234,
86
- "fewshot_seed": 1234
87
- },
88
- "git_hash": "8e1bd48d",
89
- "date": 1735663470.0459642,
90
- "pretty_env_info": "PyTorch version: 2.4.0+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.3 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.27.1\nLibc version: glibc-2.35\n\nPython version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1064-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.2.128\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100 80GB PCIe\nGPU 1: NVIDIA A100 80GB PCIe\n\nNvidia driver version: 535.161.08\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.4\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.4\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 48 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 48\nOn-line CPU(s) list: 0-47\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7V13 64-Core Processor\nCPU family: 25\nModel: 1\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 1\nStepping: 1\nBogoMIPS: 4890.87\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves clzero xsaveerptr rdpru arat umip vaes vpclmulqdq rdpid fsrm\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 1.5 MiB (48 instances)\nL1i cache: 1.5 MiB (48 instances)\nL2 cache: 24 MiB (48 instances)\nL3 cache: 192 MiB (6 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-23\nNUMA node1 CPU(s): 24-47\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec rstack overflow: Mitigation; safe RET, no microcode\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] onnx==1.14.0\n[pip3] pytorch-lightning==2.0.7\n[pip3] pytorch-quantization==2.1.2\n[pip3] torch==2.4.0\n[pip3] torch-tensorrt==2.0.0.dev0\n[pip3] torchaudio==2.1.0\n[pip3] torchdata==0.7.0a0\n[pip3] torchmetrics==1.2.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] Could not collect",
91
- "transformers_version": "4.47.1",
92
- "upper_git_hash": null,
93
- "tokenizer_pad_token": [
94
- "<unk>",
95
- "0"
96
- ],
97
- "tokenizer_eos_token": [
98
- "</s>",
99
- "2"
100
- ],
101
- "tokenizer_bos_token": [
102
- "<s>",
103
- "1"
104
- ],
105
- "eot_token_id": 2,
106
- "max_length": 4096,
107
- "task_hashes": {
108
- "sdaia_mcq": "c69b252ba97d5f402c302aadb4d06d0293774e38250e701b1d0c7984fa47dd24"
109
- },
110
- "model_source": "vllm",
111
- "model_name": "/ALLaM-7B-Instruct",
112
- "model_name_sanitized": "/ALLaM-7B-Instruct",
113
- "system_instruction": null,
114
- "system_instruction_sha": null,
115
- "fewshot_as_multiturn": false,
116
- "chat_template": null,
117
- "chat_template_sha": null,
118
- "start_time": 4130.43217211,
119
- "end_time": 4204.747507708,
120
- "total_evaluation_time_seconds": "74.31533559800027"
121
- }