naazahrani committed
Commit dbdefd4 (verified)
1 Parent(s): 48ce113

Update README.md

Files changed (1):
  1. README.md +42 -43

README.md CHANGED
@@ -127,27 +127,28 @@ All models were evaluated using our proprietary evaluation pipeline and [LM Eval
  The evaluation scores of ALLaM can be found in JSON format [here](https://huggingface.co/ALLaM-AI/ALLaM-7B-Instruct-preview/tree/main/evaluation).


- | Model | ETEC <br>0 shot | IEN-MCQ <br>0 shot | IEN-TF <br>0 shot | AraPro <br>0 shot | AraMath <br>5 shot | Ar-IFEval <br>(prompt strict) <br>0 shot | Ar-IFEval <br>(inst strict) <br>0 shot | ExamsAR <br>5 shot | ACVA <br> 5 shot | Arabic MMLU <br>0 Shot | Openai MMLU <br>0 shot | GAT <br>0 shot |AVG |
- |:----------------------------|:---------|:-----------------|:----------------|:----------------|:-----------------|:-----------------------------------|:---------------------------------|:------------------|:--------------|:--------------------|:--------------------|:-----------------------------|:----------------------|
- | ALLaM_7b-v1.27.2.25 | 66.67 | **91.77** | 82.95 | 69.71 | 66.78 | 31.34 | 67.65 | 51.58 | 76.33 | 67.78 | 55.91 | 44.53 | 64.42 |
- | AceGPT-v2-32B-Chat | 64.81 | 81.6 | 80.35 | 67.19 | 64.46 | 25.75 | 63.41 | 55.31 | 71.57 | 68.3 | 60.8 | 43.21 | 62.23 |
- | jais-family-6p7b-chat | 45.47 | 46.22 | 63.92 | 54.31 | 25.29 | 13.99 | 52.97 | 46.93 | 73.8 | 56.15 | 44.96 | 31.71 | 46.31 |
- | jais-family-13b-chat | 48.65 | 62.95 | 68.68 | 57.53 | 26.61 | 17.16 | 54.27 | 45.07 | 71.18 | 58.14 | 47.73 | 31.72 | 49.14 |
- | jais-family-30b-8k-chat | 53.52 | 72.76 | 70.65 | 61.27 | 33.39 | 16.79 | 54.68 | 50.28 | 74.47 | 63.11 | 50.9 | 36.44 | 53.19 |
- | jais-family-30b-16k-chat | 53.31 | 74.88 | 68.76 | 62.79 | 41.49 | 16.6 | 54.95 | 49.72 | 60.08 | 62.04 | 50.98 | 34.85 | 52.54 |
- | jais-adapted-7b-chat | 40.49 | 57.38 | 67.18 | 50.59 | 28.43 | 14.93 | 54.27 | 40.6 | 70.44 | 49.75 | 38.54 | 29.68 | 45.19 |
- | jais-adapted-13b-chat | 48.12 | 69.65 | 71.85 | 59.07 | 37.02 | 23.32 | 60.61 | 48.23 | 67.78 | 56.42 | 46.83 | 33.4 | 51.86 |
- | jais-adapted-70b-chat | 56.81 | 74.51 | 76.47 | 64.59 | 45.62 | 27.05 | 65.05 | 54.75 | 73.33 | 65.74 | 56.82 | 39.15 | 58.32 |
- | Qwen2.5-7B-Instruct | 64.12 | 66.38 | 78.46 | 64.63 | 71.74 | 28.17 | 65.19 | 50.65 | 78.17 | 61.54 | 56.1 | 41.42 | 60.55 |
- | Qwen2.5-14B-Instruct | 72.18 | 80.51 | 77.64 | 69.11 | 82.81 | 68.66 | 86.76 | 57.54 | 75.04 | 69.36 | 63.8 | 51.7 | 71.26 |
- | Qwen2.5-72B-Instruct | **78.7** | 86.88 | **86.62** | **74.69** | **92.89** | 67.72 | 87.51 | 60.71 | **79.92** | **74.1** | **73.59** | **59.54** | **76.91** |
- | Mistral-7B-Instruct-v0.3 | 35.67 | 53.59 | 63.4 | 43.85 | 27.11 | 30.41 | 64.03 | 34.08 | 60.25 | 45.27 | 32.3 | 26.65 | 43.05 |
- | Mistral-Nemo-Instruct-2407 | 49.28 | 68.43 | 71.78 | 57.61 | 40.0 | 35.82 | 70.58 | 47.49 | 76.92 | 55.97 | 46.15 | 25.44 | 53.79 |
- | Mistral-Small-Instruct-2409 | 40.96 | 60.64 | 63.66 | 47.73 | 44.46 | 51.12 | 78.16 | 38.73 | 68.93 | 50.43 | 39.63 | 28.82 | 51.11 |
- | falcon-mamba-7b-instruct | 37.52 | 52.65 | 57.63 | 41.47 | 56.53 | 8.58 | 47.92 | 28.49 | 63.52 | 39.27 | 28.45 | 29.69 | 40.98 |
- | Llama-3.1-8B-Instruct | 45.68 | 59.23 | 71.7 | 52.51 | 34.38 | 51.87 | 79.11 | 54.0 | 70.54 | 56.53 | 44.67 | 30.76 | 54.25 |
- | Llama-3.3-70B-Instruct | 68.84 | 79.6 | 78.81 | 70.49 | 70.91 | **70.9** | **88.6** | **65.74** | 76.93 | 72.01 | 70.25 | 44.12 | 71.43 |
- <!-- | AceGPT-v2-8B-Chat | nan | nan | nan | nan | nan | nan | nan | 51.96 | 72.69 | 57.02 | 49.99 | 36.15 | 53.56 | -->
+
+ | Model |AVG | ETEC <br>0 shot | IEN-MCQ <br>0 shot | IEN-TF <br>0 shot | AraPro <br>0 shot | AraMath <br>5 shot | Ar-IFEval <br>(prompt strict) <br>0 shot | Ar-IFEval <br>(inst strict) <br>0 shot | ExamsAR <br>5 shot | ACVA <br> 5 shot | Arabic MMLU <br>0 Shot | Openai MMLU <br>0 shot | GAT <br>0 shot |
+ |:----------------------------|:----------|:---------|:-----------------|:----------------|:----------------|:-----------------|:-----------------------------------|:---------------------------------|:------------------|:--------------|:--------------------|:--------------------|:-----------------------------|
+ | ALLaM-7B-Instruct-preview | 64.42 | 66.67 | **91.77** | 82.95 | 69.71 | 66.78 | 31.34 | 67.65 | 51.58 | 76.33 | 67.78 | 55.91 | 44.53 |
+ | AceGPT-v2-32B-Chat | 62.23 | 64.81 | 81.6 | 80.35 | 67.19 | 64.46 | 25.75 | 63.41 | 55.31 | 71.57 | 68.3 | 60.8 | 43.21 |
+ | jais-family-6p7b-chat | 46.31 | 45.47 | 46.22 | 63.92 | 54.31 | 25.29 | 13.99 | 52.97 | 46.93 | 73.8 | 56.15 | 44.96 | 31.71 |
+ | jais-family-13b-chat | 49.14 | 48.65 | 62.95 | 68.68 | 57.53 | 26.61 | 17.16 | 54.27 | 45.07 | 71.18 | 58.14 | 47.73 | 31.72 |
+ | jais-family-30b-16k-chat | 52.54 | 53.31 | 74.88 | 68.76 | 62.79 | 41.49 | 16.6 | 54.95 | 49.72 | 60.08 | 62.04 | 50.98 | 34.85 |
+ | jais-family-30b-8k-chat | 53.19 | 53.52 | 72.76 | 70.65 | 61.27 | 33.39 | 16.79 | 54.68 | 50.28 | 74.47 | 63.11 | 50.9 | 36.44 |
+ | jais-adapted-7b-chat | 45.19 | 40.49 | 57.38 | 67.18 | 50.59 | 28.43 | 14.93 | 54.27 | 40.6 | 70.44 | 49.75 | 38.54 | 29.68 |
+ | jais-adapted-13b-chat | 51.86 | 48.12 | 69.65 | 71.85 | 59.07 | 37.02 | 23.32 | 60.61 | 48.23 | 67.78 | 56.42 | 46.83 | 33.4 |
+ | jais-adapted-70b-chat | 58.32 | 56.81 | 74.51 | 76.47 | 64.59 | 45.62 | 27.05 | 65.05 | 54.75 | 73.33 | 65.74 | 56.82 | 39.15 |
+ | falcon-mamba-7b-instruct | 40.98 | 37.52 | 52.65 | 57.63 | 41.47 | 56.53 | 8.58 | 47.92 | 28.49 | 63.52 | 39.27 | 28.45 | 29.69 |
+ | Qwen2.5-7B-Instruct | 60.55 | 64.12 | 66.38 | 78.46 | 64.63 | 71.74 | 28.17 | 65.19 | 50.65 | 78.17 | 61.54 | 56.1 | 41.42 |
+ | Qwen2.5-14B-Instruct | 71.26 | 72.18 | 80.51 | 77.64 | 69.11 | 82.81 | 68.66 | 86.76 | 57.54 | 75.04 | 69.36 | 63.8 | 51.7 |
+ | Qwen2.5-72B-Instruct | **76.91** | **78.7** | 86.88 | **86.62** | **74.69** | **92.89** | 67.72 | 87.51 | 60.71 | **79.92** | **74.1** | **73.59** | **59.54** |
+ | Mistral-7B-Instruct-v0.3 | 43.05 | 35.67 | 53.59 | 63.4 | 43.85 | 27.11 | 30.41 | 64.03 | 34.08 | 60.25 | 45.27 | 32.3 | 26.65 |
+ | Mistral-Nemo-Instruct-2407 | 53.79 | 49.28 | 68.43 | 71.78 | 57.61 | 40.0 | 35.82 | 70.58 | 47.49 | 76.92 | 55.97 | 46.15 | 25.44 |
+ | Mistral-Small-Instruct-2409 | 51.11 | 40.96 | 60.64 | 63.66 | 47.73 | 44.46 | 51.12 | 78.16 | 38.73 | 68.93 | 50.43 | 39.63 | 28.82 |
+ | Llama-3.1-8B-Instruct | 54.25 | 45.68 | 59.23 | 71.7 | 52.51 | 34.38 | 51.87 | 79.11 | 54.0 | 70.54 | 56.53 | 44.67 | 30.76 |
+ | Llama-3.3-70B-Instruct | 71.43 | 68.84 | 79.6 | 78.81 | 70.49 | 70.91 | **70.9** | **88.6** | **65.74** | 76.93 | 72.01 | 70.25 | 44.12 |
+ <!-- | AceGPT-v2-8B-Chat | 53.56 | nan | nan | nan | nan | nan | nan | nan | 51.96 | 72.69 | 57.02 | 49.99 | 36.15 | -->

  Closed models evaluations:

@@ -159,28 +160,26 @@ Closed models evaluations:

  #### English Benchmarks

-
- | model | AGIEval 0 Shot | Arc (challenge) 0 Shot | GPQA (main) 0 Shot | Hendrycks <br>ethics 0 Shot | Winogrande 0 Shot | HellaSwag 0 Shot | TriviaQa 5 Shot | MMLU Pro<br>5 Shot | Minerva Math <br>4 Shot | MMLU 0 Shot | TruthfulQA <br>(mc2) 0 Shot | IFEval <br>(prompt strict)<br>0 Shot | IFEval <br>(inst strict)<br>0 Shot | GSM8k 5 Shot |
- |:----------------------------------|-----------------:|-----------------------:|--------------------------:|--------------------------:|--------------------:|-------------------:|------------------:|------------------:|----------------------:|--------------:|------------------------:|----------------------------------:|--------------------------------:|---------------:|
- | ALLaM-7B-Instruct-preview | 41.75 | 51.28 | 22.1 | 73.17 | 70.48 | 76.26 | 15.96 | 30.43 | 17.42 | 59.6 | 46.67 | 37.71 | 48.68 | 62.02 | 85.35 | 79.11 | 47.47 | 36.73 | 69.42 | 56.35 | 58.23 | 68.35 | 81.43 |
- | AceGPT-v2-8B-Chat | 37.17 | 53.5 | 25.67 | 68.14 | 73.72 | 79.21 | 67.65 | 37.38 | 17.58 | 64.62 | 55.2 | 23.48 | 32.97 | 56.86 |
- | jais-family-6p7b-chat | 30.56 | 44.62 | 23.21 | 65.7 | 62.43 | 72.05 | 29.74 | 23.3 | 2.56 | 49.62 | 40.99 | 14.05 | 23.5 | 54.36 |
- | jais-family-13b-chat | 30.31 | 47.87 | 25.89 | 65.91 | 65.04 | 75 | 35.82 | 24.4 | 18.92 | 51.91 | 40.57 | 20.52 | 31.89 | 64.59 |
- | jais-family-30b-8k-chat | 36.65 | 48.38 | 21.88 | 69.28 | 70.32 | 78.55 | 46.67 | 28.7 | 26.46 | 57.46 | 49.49 | 23.84 | 37.41 | 72.18 |
- | jais-family-30b-16k-chat | 31.85 | 48.46 | 23.88 | 69.44 | 68.19 | 76.21 | 43.99 | 29.11 | 22.3 | 58.5 | 44.78 | 18.3 | 30.22 | 68.01 |
- | jais-adapted-7b-chat | 32.9 | 52.65 | 23.88 | 55.32 | 71.74 | 79.39 | 63.89 | 24.38 | 15.34 | 52.36 | 41.12 | 22 | 35.73 | 58.07 |
- | jais-adapted-13b-chat | 36.49 | 54.18 | 26.34 | 65.73 | 69.77 | 80.86 | 58.48 | 26.29 | 21.34 | 55.66 | 42.27 | 24.95 | 36.57 | 68.84 |
- | jais-adapted-70b-chat | 39.96 | 59.56 | 20.98 | 70.77 | 77.27 | 84.06 | 68.6 | 37.33 | 27.72 | 65.23 | 44.49 | 31.98 | 44.36 | 76.8 |
- | Qwen2.5-7B-Instruct | 59.1 | 51.28 | 26.56 | 72.78 | 69.38 | 79.55 | 50.59 | 36.1 | 12.04 | 70.56 | 58.93 | 58.04 | 68.35 | 43.29 |
- | Qwen2.5-14B-Instruct | 66.32 | 62.12 | 25.89 | 76.19 | 75.77 | 84.36 | 59.47 | 49.6 | 22.6 | 78.93 | 69.01 | 51.57 | 64.27 | 79.38 |
- | Qwen2.5-72B-Instruct | 71.09 | 63.48 | 25.67 | 78.33 | 76.24 | 87.41 | 70.9 | 62.77 | 54.04 | 83.44 | 69.54 | 67.47 | 76.86 | 93.25 |
- | Mistral-7B-Instruct-v0.3 | 36.54 | 58.87 | 23.21 | 72.53 | 73.95 | 82.93 | 67.97 | 33.61 | 13.44 | 59.74 | 59.69 | 42.51 | 54.8 | 48.37 |
- | Mistral-Nemo-Instruct-2407 | 39.88 | 59.13 | 24.33 | 67.82 | 74.74 | 82.35 | 72.93 | 44.27 | 30.22 | 65.56 | 54.88 | 30.13 | 38.97 | 74.15 |
- | Mistral-Small-Instruct-2409 | 40.76 | 60.49 | 25.89 | 72.27 | 78.53 | 85.35 | 79.11 | 47.47 | 39.42 | 69.42 | 56.35 | 58.23 | 68.35 | 81.43 |
- | falcon-mamba-7b-instruct | 35.57 | 57.68 | 27.46 | 67.68 | 73.4 | 78.21 | 33.1 | 24.79 | 16.82 | 60.51 | 53.19 | 28.1 | 38.61 | 56.33 |
- | Llama-3.1-8B-Instruct | 42.38 | 55.12 | 27.01 | 66.69 | 73.88 | 79.28 | 70.08 | 41.16 | 34.14 | 67.97 | 54.05 | 42.7 | 57.55 | 75.82 |
- | Llama-3.1-70B-Instruct | 52.6 | 63.05 | 27.01 | 80.28 | 79.08 | 84.67 | 82.09 | 59 | 49.18 | 82.36 | 59.92 | 70.98 | 79.74 | 88.4 |
-
+ | model |Avg | AGIEval 0 Shot | Arc (challenge) 0 Shot | GPQA (main) 0 Shot | Hendrycks <br>ethics 0 Shot | Winogrande 0 Shot | HellaSwag 0 Shot | TriviaQa 5 Shot | MMLU Pro<br>5 Shot | Minerva Math <br>4 Shot | MMLU 0 Shot | TruthfulQA <br>(mc2) 0 Shot | IFEval <br>(prompt strict)<br>0 Shot | IFEval <br>(inst strict)<br>0 Shot | GSM8k 5 Shot |
+ |:----------------------------------|:----------|:-----------------|:-----------------------|:--------------------------|:--------------------------|:--------------------|:-------------------|:------------------|:------------------|:----------------------|:--------------|:------------------------|:---------------------------------|:-------------------------------|:---------------|
+ | ALLaM-7B-Instruct-preview | 46.85 | 41.99 | 51.28 | 22.77 | 73.17 | 70.48 | 76.26 | 16.07 | 30.4 | 17.3 | 59.6 | 46.67 | 38.08 | 50.0 | 61.79 |
+ | AceGPT-v2-8B-Chat | 49.51 | 37.17 | 53.5 | 25.67 | 68.14 | 73.72 | 79.21 | 67.65 | 37.38 | 17.58 | 64.62 | 55.2 | 23.48 | 32.97 | 56.86 |
+ | jais-family-6p7b-chat | 38.34 | 30.56 | 44.62 | 23.21 | 65.7 | 62.43 | 72.05 | 29.74 | 23.3 | 2.56 | 49.62 | 40.99 | 14.05 | 23.5 | 54.36 |
+ | jais-family-13b-chat | 42.62 | 30.31 | 47.87 | 25.89 | 65.91 | 65.04 | 75.0 | 35.82 | 24.4 | 19.1 | 51.91 | 40.57 | 19.41 | 30.82 | 64.59 |
+ | jais-family-30b-16k-chat | 45.15 | 31.85 | 48.46 | 23.88 | 69.44 | 68.19 | 76.21 | 43.99 | 29.11 | 22.3 | 58.5 | 44.78 | 18.3 | 29.14 | 67.93 |
+ | jais-family-30b-8k-chat | 47.59 | 36.65 | 48.38 | 21.88 | 69.28 | 70.32 | 78.55 | 46.67 | 28.7 | 26.44 | 57.46 | 49.49 | 22.92 | 37.05 | 72.48 |
+ | jais-adapted-7b-chat | 44.91 | 32.9 | 52.65 | 23.88 | 55.32 | 71.74 | 79.39 | 63.89 | 24.38 | 15.34 | 52.36 | 41.12 | 22.0 | 35.73 | 58.07 |
+ | jais-adapted-13b-chat | 47.7 | 36.49 | 54.18 | 26.34 | 65.73 | 69.77 | 80.86 | 58.48 | 26.29 | 21.34 | 55.66 | 42.27 | 24.95 | 36.57 | 68.84 |
+ | jais-adapted-70b-chat | 53.49 | 39.96 | 59.56 | 20.98 | 70.8 | 77.27 | 84.06 | 68.64 | 37.25 | 27.72 | 65.23 | 44.49 | 31.61 | 44.0 | 77.26 |
+ | Qwen2.5-7B-Instruct | 54.68 | 59.2 | 51.28 | 26.56 | 73.76 | 69.38 | 79.55 | 50.59 | 44.92 | 12.04 | 70.56 | 58.93 | 57.3 | 68.23 | 43.29 |
+ | Qwen2.5-14B-Instruct | 62.37 | 66.32 | 62.12 | 25.89 | 76.19 | 75.77 | 84.36 | 59.47 | 52.44 | 23.04 | 78.93 | 69.01 | 52.13 | 64.03 | 83.47 |
+ | Qwen2.5-72B-Instruct | **70.06** | **71.09** | **63.48** | 25.67 | 78.33 | 76.24 | **87.41** | 70.9 | **62.77** | **54.04** | **83.44** | **69.54** | 67.65 | 77.1 | **93.25** |
+ | Mistral-7B-Instruct-v0.3 | 51.98 | 36.45 | 58.87 | 23.21 | 72.58 | 73.95 | 82.93 | 67.97 | 33.18 | 13.44 | 59.74 | 59.69 | 42.51 | 54.8 | 48.37 |
+ | Mistral-Nemo-Instruct-2407 | 54.0 | 39.65 | 59.04 | 24.33 | 67.86 | 74.66 | 82.35 | 72.77 | 44.27 | 29.62 | 65.56 | 54.88 | 30.13 | 38.97 | 71.95 |
+ | Mistral-Small-Instruct-2409 | 61.65 | 40.76 | 60.49 | 25.89 | 72.27 | 78.53 | 85.35 | 79.11 | 47.47 | 39.42 | 69.42 | 56.35 | 58.23 | 68.35 | 81.43 |
+ | falcon-mamba-7b-instruct | 46.53 | 35.57 | 57.68 | **27.46** | 67.68 | 73.4 | 78.21 | 33.1 | 24.79 | 16.82 | 60.51 | 53.19 | 28.1 | 38.61 | 56.33 |
+ | Llama-3.1-70B-Instruct | 68.45 | 52.6 | 63.05 | 27.01 | **80.28** | **79.08** | 84.67 | **82.09** | 59.0 | 49.18 | 82.36 | 59.92 | **70.98** | **79.74** | 88.4 |
+ | Llama-3.1-8B-Instruct | 56.27 | 42.38 | 55.12 | 27.01 | 66.69 | 73.88 | 79.28 | 70.08 | 41.16 | 34.14 | 67.97 | 54.05 | 42.7 | 57.55 | 75.82 |

  ### MT-bench

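The per-benchmark scores behind these tables are published as JSON files in the `evaluation` folder of the model repository linked in the README. A minimal sketch of pulling those files down with `huggingface_hub` and listing their contents is shown below; the exact filenames and JSON schema inside that folder are assumptions, not documented in this commit.

```python
# Sketch: fetch ALLaM's published evaluation JSON files and inspect them.
# Assumes the scores live as plain JSON under evaluation/ in the model repo;
# adapt the glob pattern and key handling once the actual layout is known.
import json
from pathlib import Path

from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="ALLaM-AI/ALLaM-7B-Instruct-preview",
    allow_patterns=["evaluation/*"],  # download only the evaluation artifacts
)

for path in sorted(Path(local_dir, "evaluation").glob("*.json")):
    with open(path) as f:
        data = json.load(f)
    # Print each file name and a few of its top-level keys as a quick sanity check.
    print(path.name, list(data)[:5])
```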