David Pomerenke committed on
Commit
c5278dd
·
1 Parent(s): d1a7111

More models

Browse files
Files changed (3) hide show
  1. evals/models.py +9 -2
  2. frontend/public/results.json +125 -20
  3. results.json +668 -24
evals/models.py CHANGED
@@ -15,12 +15,19 @@ models = [
15
  "openai/gpt-4o-mini", # 0.6$/M tokens
16
  # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
17
  "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
 
 
18
  "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
 
19
  "google/gemini-2.0-flash-001", # 0.4$/M tokens
 
 
20
  # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
21
- # "deepseek/deepseek-chat", # 0.9$/M tokens
 
22
  # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
23
- "google/gemma-3-27b-it", # 0.2$/M tokens
 
24
  ]
25
  model_fast = "meta-llama/llama-3.3-70b-instruct"
26
 
 
15
  "openai/gpt-4o-mini", # 0.6$/M tokens
16
  # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
17
  "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
18
+ "meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
19
+ "meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
20
  "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
21
+ "mistralai/mistral-nemo",
22
  "google/gemini-2.0-flash-001", # 0.4$/M tokens
23
+ "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
24
+ "google/gemma-3-27b-it", # 0.2$/M tokens
25
  # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
26
+ "qwen/qwq-32b",
27
+ # "deepseek/deepseek-chat", # 1.3$/M tokens
28
  # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
29
+ "microsoft/phi-4-multimodal-instruct",
30
+ "amazon/nova-micro-v1", # 0.09$/M tokens
31
  ]
32
  model_fast = "meta-llama/llama-3.3-70b-instruct"
33
 
frontend/public/results.json CHANGED
@@ -18,6 +18,21 @@
18
  {
19
  "rank": 2,
20
  "provider": "Google",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "model": "Gemma 3 27b It",
22
  "hf_id": "google/gemma-3-27b-it",
23
  "creation_date": "2025-03-01",
@@ -31,7 +46,52 @@
31
  "translation_chrf": 0.54
32
  },
33
  {
34
- "rank": 3,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  "provider": "OpenAI",
36
  "model": "GPT 4o Mini",
37
  "hf_id": null,
@@ -46,7 +106,7 @@
46
  "translation_chrf": 0.55
47
  },
48
  {
49
- "rank": 4,
50
  "provider": "MistralAI",
51
  "model": "Mistral Small 24b Instruct 2501",
52
  "hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
@@ -61,7 +121,7 @@
61
  "translation_chrf": 0.52
62
  },
63
  {
64
- "rank": 5,
65
  "provider": "Meta Llama",
66
  "model": "Llama 3.3 70b Instruct",
67
  "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
@@ -74,6 +134,51 @@
74
  "language_modeling_chrf": 0.94,
75
  "translation_bleu": 0.31,
76
  "translation_chrf": 0.48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  }
78
  ],
79
  "language_table": [
@@ -81,25 +186,13 @@
81
  "language_name": "English",
82
  "speakers": 1636485840,
83
  "family": "Indo-European",
84
- "average": 0.51,
85
  "in_benchmark": true,
86
  "NaN": 0.0,
87
- "classification_accuracy": 0.65,
88
- "language_modeling_chrf": 0.94,
89
- "translation_bleu": 0.43,
90
- "translation_chrf": 0.55
91
- },
92
- {
93
- "language_name": "Chinese",
94
- "speakers": 1304678914,
95
- "family": "Sino-Tibetan",
96
- "average": 0.5,
97
- "in_benchmark": true,
98
- "NaN": 0.0,
99
- "classification_accuracy": 0.65,
100
- "language_modeling_chrf": 0.93,
101
- "translation_bleu": 0.38,
102
- "translation_chrf": 0.55
103
  },
104
  {
105
  "language_name": "French",
@@ -113,6 +206,18 @@
113
  "translation_bleu": 0.32,
114
  "translation_chrf": 0.49
115
  },
 
 
 
 
 
 
 
 
 
 
 
 
116
  {
117
  "language_name": "Hindi",
118
  "speakers": 546882144,
 
18
  {
19
  "rank": 2,
20
  "provider": "Google",
21
+ "model": "Gemini 2.0 Flash Lite 001",
22
+ "hf_id": null,
23
+ "creation_date": null,
24
+ "size": null,
25
+ "type": "Commercial",
26
+ "license": null,
27
+ "average": 0.66,
28
+ "classification_accuracy": 0.73,
29
+ "language_modeling_chrf": 0.97,
30
+ "translation_bleu": 0.4,
31
+ "translation_chrf": 0.54
32
+ },
33
+ {
34
+ "rank": 3,
35
+ "provider": "Google",
36
  "model": "Gemma 3 27b It",
37
  "hf_id": "google/gemma-3-27b-it",
38
  "creation_date": "2025-03-01",
 
46
  "translation_chrf": 0.54
47
  },
48
  {
49
+ "rank": 4,
50
+ "provider": "Meta Llama",
51
+ "model": "Llama 3.1 70b Instruct",
52
+ "hf_id": "meta-llama/Llama-3.1-70B-Instruct",
53
+ "creation_date": "2024-07-16",
54
+ "size": 70553706496.0,
55
+ "type": "Open",
56
+ "license": "Llama3.1",
57
+ "average": 0.62,
58
+ "classification_accuracy": 0.57,
59
+ "language_modeling_chrf": 0.92,
60
+ "translation_bleu": 0.43,
61
+ "translation_chrf": 0.57
62
+ },
63
+ {
64
+ "rank": 5,
65
+ "provider": "Amazon",
66
+ "model": "Nova Micro V1",
67
+ "hf_id": null,
68
+ "creation_date": null,
69
+ "size": null,
70
+ "type": "Commercial",
71
+ "license": null,
72
+ "average": 0.61,
73
+ "classification_accuracy": 0.52,
74
+ "language_modeling_chrf": 0.94,
75
+ "translation_bleu": 0.4,
76
+ "translation_chrf": 0.56
77
+ },
78
+ {
79
+ "rank": 6,
80
+ "provider": "Meta Llama",
81
+ "model": "Llama 3 70b Instruct",
82
+ "hf_id": null,
83
+ "creation_date": null,
84
+ "size": null,
85
+ "type": "Commercial",
86
+ "license": null,
87
+ "average": 0.61,
88
+ "classification_accuracy": 0.8,
89
+ "language_modeling_chrf": 0.95,
90
+ "translation_bleu": 0.25,
91
+ "translation_chrf": 0.43
92
+ },
93
+ {
94
+ "rank": 7,
95
  "provider": "OpenAI",
96
  "model": "GPT 4o Mini",
97
  "hf_id": null,
 
106
  "translation_chrf": 0.55
107
  },
108
  {
109
+ "rank": 8,
110
  "provider": "MistralAI",
111
  "model": "Mistral Small 24b Instruct 2501",
112
  "hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
 
121
  "translation_chrf": 0.52
122
  },
123
  {
124
+ "rank": 9,
125
  "provider": "Meta Llama",
126
  "model": "Llama 3.3 70b Instruct",
127
  "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
 
134
  "language_modeling_chrf": 0.94,
135
  "translation_bleu": 0.31,
136
  "translation_chrf": 0.48
137
+ },
138
+ {
139
+ "rank": 10,
140
+ "provider": "MistralAI",
141
+ "model": "Mistral Nemo",
142
+ "hf_id": null,
143
+ "creation_date": null,
144
+ "size": null,
145
+ "type": "Commercial",
146
+ "license": null,
147
+ "average": 0.55,
148
+ "classification_accuracy": 0.5,
149
+ "language_modeling_chrf": 0.88,
150
+ "translation_bleu": 0.32,
151
+ "translation_chrf": 0.49
152
+ },
153
+ {
154
+ "rank": 11,
155
+ "provider": "Microsoft",
156
+ "model": "Phi 4 Multimodal Instruct",
157
+ "hf_id": "microsoft/Phi-4-multimodal-instruct",
158
+ "creation_date": "2025-02-24",
159
+ "size": 5574460384.0,
160
+ "type": "Open",
161
+ "license": "Mit",
162
+ "average": 0.52,
163
+ "classification_accuracy": 0.42,
164
+ "language_modeling_chrf": 0.87,
165
+ "translation_bleu": 0.32,
166
+ "translation_chrf": 0.46
167
+ },
168
+ {
169
+ "rank": 12,
170
+ "provider": "Qwen",
171
+ "model": "Qwq 32b",
172
+ "hf_id": "Qwen/QwQ-32B",
173
+ "creation_date": "2025-03-05",
174
+ "size": 32763876352.0,
175
+ "type": "Open",
176
+ "license": "Apache 2.0",
177
+ "average": 0.25,
178
+ "classification_accuracy": 0.0,
179
+ "language_modeling_chrf": 0.48,
180
+ "translation_bleu": 0.21,
181
+ "translation_chrf": 0.3
182
  }
183
  ],
184
  "language_table": [
 
186
  "language_name": "English",
187
  "speakers": 1636485840,
188
  "family": "Indo-European",
189
+ "average": 0.47,
190
  "in_benchmark": true,
191
  "NaN": 0.0,
192
+ "classification_accuracy": 0.58,
193
+ "language_modeling_chrf": 0.92,
194
+ "translation_bleu": 0.37,
195
+ "translation_chrf": 0.49
 
 
 
 
 
 
 
 
 
 
 
 
196
  },
197
  {
198
  "language_name": "French",
 
206
  "translation_bleu": 0.32,
207
  "translation_chrf": 0.49
208
  },
209
+ {
210
+ "language_name": "Chinese",
211
+ "speakers": 1304678914,
212
+ "family": "Sino-Tibetan",
213
+ "average": 0.46,
214
+ "in_benchmark": true,
215
+ "NaN": 0.0,
216
+ "classification_accuracy": 0.55,
217
+ "language_modeling_chrf": 0.86,
218
+ "translation_bleu": 0.35,
219
+ "translation_chrf": 0.53
220
+ },
221
  {
222
  "language_name": "Hindi",
223
  "speakers": 546882144,
results.json CHANGED
@@ -3,33 +3,61 @@
3
  {
4
  "task": "classification",
5
  "metric": "accuracy",
6
- "score": 0.5722222222222223,
7
  "bcp_47": 10,
8
- "model": 5
9
  },
10
  {
11
  "task": "language_modeling",
12
  "metric": "chrf",
13
- "score": 0.9360730990265229,
14
  "bcp_47": 10,
15
- "model": 5
16
  },
17
  {
18
  "task": "translation",
19
  "metric": "bleu",
20
- "score": 0.3508793079301233,
21
  "bcp_47": 10,
22
- "model": 5
23
  },
24
  {
25
  "task": "translation",
26
  "metric": "chrf",
27
- "score": 0.5117214627054559,
28
  "bcp_47": 10,
29
- "model": 5
30
  }
31
  ],
32
  "models": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  {
34
  "model": "google/gemini-2.0-flash-001",
35
  "task": "classification",
@@ -58,6 +86,34 @@
58
  "score": 0.5828490054615683,
59
  "bcp_47": 2
60
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  {
62
  "model": "google/gemma-3-27b-it",
63
  "task": "classification",
@@ -86,6 +142,62 @@
86
  "score": 0.5376336154503363,
87
  "bcp_47": 2
88
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  {
90
  "model": "meta-llama/llama-3.3-70b-instruct",
91
  "task": "classification",
@@ -114,6 +226,62 @@
114
  "score": 0.4836914110309717,
115
  "bcp_47": 10
116
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  {
118
  "model": "mistralai/mistral-small-24b-instruct-2501",
119
  "task": "classification",
@@ -169,6 +337,34 @@
169
  "metric": "chrf",
170
  "score": 0.5452510379336759,
171
  "bcp_47": 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  }
173
  ],
174
  "languages": [
@@ -2554,8 +2750,8 @@
2554
  "in_benchmark": true,
2555
  "task": "classification",
2556
  "metric": "accuracy",
2557
- "score": 0.6466666666666667,
2558
- "model": 5.0
2559
  },
2560
  {
2561
  "bcp_47": "en",
@@ -2569,8 +2765,8 @@
2569
  "in_benchmark": true,
2570
  "task": "language_modeling",
2571
  "metric": "chrf",
2572
- "score": 0.9391757502550891,
2573
- "model": 5.0
2574
  },
2575
  {
2576
  "bcp_47": "en",
@@ -2584,8 +2780,8 @@
2584
  "in_benchmark": true,
2585
  "task": "translation",
2586
  "metric": "bleu",
2587
- "score": 0.4274883186793429,
2588
- "model": 5.0
2589
  },
2590
  {
2591
  "bcp_47": "en",
@@ -2599,8 +2795,8 @@
2599
  "in_benchmark": true,
2600
  "task": "translation",
2601
  "metric": "chrf",
2602
- "score": 0.5456627250056182,
2603
- "model": 5.0
2604
  },
2605
  {
2606
  "bcp_47": "eo",
@@ -10699,8 +10895,8 @@
10699
  "in_benchmark": true,
10700
  "task": "classification",
10701
  "metric": "accuracy",
10702
- "score": 0.6466666666666667,
10703
- "model": 5.0
10704
  },
10705
  {
10706
  "bcp_47": "zh",
@@ -10714,8 +10910,8 @@
10714
  "in_benchmark": true,
10715
  "task": "language_modeling",
10716
  "metric": "chrf",
10717
- "score": 0.93253470927813,
10718
- "model": 5.0
10719
  },
10720
  {
10721
  "bcp_47": "zh",
@@ -10729,8 +10925,8 @@
10729
  "in_benchmark": true,
10730
  "task": "translation",
10731
  "metric": "bleu",
10732
- "score": 0.37546421356438325,
10733
- "model": 5.0
10734
  },
10735
  {
10736
  "bcp_47": "zh",
@@ -10744,8 +10940,8 @@
10744
  "in_benchmark": true,
10745
  "task": "translation",
10746
  "metric": "chrf",
10747
- "score": 0.553563744610482,
10748
- "model": 5.0
10749
  },
10750
  {
10751
  "bcp_47": "zmi",
@@ -10794,6 +10990,70 @@
10794
  }
10795
  ],
10796
  "scores": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10797
  {
10798
  "model": "google/gemini-2.0-flash-001",
10799
  "bcp_47": "en",
@@ -10858,6 +11118,70 @@
10858
  "score": 0.5606266861920302,
10859
  "sentence_nr": 14.5
10860
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10861
  {
10862
  "model": "google/gemma-3-27b-it",
10863
  "bcp_47": "en",
@@ -10922,6 +11246,134 @@
10922
  "score": 0.520771580386218,
10923
  "sentence_nr": 14.5
10924
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10925
  {
10926
  "model": "meta-llama/llama-3.3-70b-instruct",
10927
  "bcp_47": "ar",
@@ -11242,6 +11694,134 @@
11242
  "score": 0.5862284100611604,
11243
  "sentence_nr": 14.5
11244
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11245
  {
11246
  "model": "mistralai/mistral-small-24b-instruct-2501",
11247
  "bcp_47": "en",
@@ -11369,6 +11949,70 @@
11369
  "metric": "chrf",
11370
  "score": 0.559410465345808,
11371
  "sentence_nr": 14.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11372
  }
11373
  ]
11374
  }
 
3
  {
4
  "task": "classification",
5
  "metric": "accuracy",
6
+ "score": 0.5427083333333333,
7
  "bcp_47": 10,
8
+ "model": 12
9
  },
10
  {
11
  "task": "language_modeling",
12
  "metric": "chrf",
13
+ "score": 0.9024222998985517,
14
  "bcp_47": 10,
15
+ "model": 12
16
  },
17
  {
18
  "task": "translation",
19
  "metric": "bleu",
20
+ "score": 0.34325329881872996,
21
  "bcp_47": 10,
22
+ "model": 12
23
  },
24
  {
25
  "task": "translation",
26
  "metric": "chrf",
27
+ "score": 0.49764810942023735,
28
  "bcp_47": 10,
29
+ "model": 12
30
  }
31
  ],
32
  "models": [
33
+ {
34
+ "model": "amazon/nova-micro-v1",
35
+ "task": "classification",
36
+ "metric": "accuracy",
37
+ "score": 0.5166666666666666,
38
+ "bcp_47": 2
39
+ },
40
+ {
41
+ "model": "amazon/nova-micro-v1",
42
+ "task": "language_modeling",
43
+ "metric": "chrf",
44
+ "score": 0.9446198732700857,
45
+ "bcp_47": 2
46
+ },
47
+ {
48
+ "model": "amazon/nova-micro-v1",
49
+ "task": "translation",
50
+ "metric": "bleu",
51
+ "score": 0.40042093531509637,
52
+ "bcp_47": 2
53
+ },
54
+ {
55
+ "model": "amazon/nova-micro-v1",
56
+ "task": "translation",
57
+ "metric": "chrf",
58
+ "score": 0.5642142196700637,
59
+ "bcp_47": 2
60
+ },
61
  {
62
  "model": "google/gemini-2.0-flash-001",
63
  "task": "classification",
 
86
  "score": 0.5828490054615683,
87
  "bcp_47": 2
88
  },
89
+ {
90
+ "model": "google/gemini-2.0-flash-lite-001",
91
+ "task": "classification",
92
+ "metric": "accuracy",
93
+ "score": 0.7333333333333333,
94
+ "bcp_47": 2
95
+ },
96
+ {
97
+ "model": "google/gemini-2.0-flash-lite-001",
98
+ "task": "language_modeling",
99
+ "metric": "chrf",
100
+ "score": 0.9710194350890375,
101
+ "bcp_47": 2
102
+ },
103
+ {
104
+ "model": "google/gemini-2.0-flash-lite-001",
105
+ "task": "translation",
106
+ "metric": "bleu",
107
+ "score": 0.40085159165111883,
108
+ "bcp_47": 2
109
+ },
110
+ {
111
+ "model": "google/gemini-2.0-flash-lite-001",
112
+ "task": "translation",
113
+ "metric": "chrf",
114
+ "score": 0.5422821788946908,
115
+ "bcp_47": 2
116
+ },
117
  {
118
  "model": "google/gemma-3-27b-it",
119
  "task": "classification",
 
142
  "score": 0.5376336154503363,
143
  "bcp_47": 2
144
  },
145
+ {
146
+ "model": "meta-llama/llama-3-70b-instruct",
147
+ "task": "classification",
148
+ "metric": "accuracy",
149
+ "score": 0.8,
150
+ "bcp_47": 2
151
+ },
152
+ {
153
+ "model": "meta-llama/llama-3-70b-instruct",
154
+ "task": "language_modeling",
155
+ "metric": "chrf",
156
+ "score": 0.9452435586756014,
157
+ "bcp_47": 2
158
+ },
159
+ {
160
+ "model": "meta-llama/llama-3-70b-instruct",
161
+ "task": "translation",
162
+ "metric": "bleu",
163
+ "score": 0.25148401884229143,
164
+ "bcp_47": 2
165
+ },
166
+ {
167
+ "model": "meta-llama/llama-3-70b-instruct",
168
+ "task": "translation",
169
+ "metric": "chrf",
170
+ "score": 0.4285750600098188,
171
+ "bcp_47": 2
172
+ },
173
+ {
174
+ "model": "meta-llama/llama-3.1-70b-instruct",
175
+ "task": "classification",
176
+ "metric": "accuracy",
177
+ "score": 0.5666666666666667,
178
+ "bcp_47": 2
179
+ },
180
+ {
181
+ "model": "meta-llama/llama-3.1-70b-instruct",
182
+ "task": "language_modeling",
183
+ "metric": "chrf",
184
+ "score": 0.9203465184571391,
185
+ "bcp_47": 2
186
+ },
187
+ {
188
+ "model": "meta-llama/llama-3.1-70b-instruct",
189
+ "task": "translation",
190
+ "metric": "bleu",
191
+ "score": 0.43182300663190504,
192
+ "bcp_47": 2
193
+ },
194
+ {
195
+ "model": "meta-llama/llama-3.1-70b-instruct",
196
+ "task": "translation",
197
+ "metric": "chrf",
198
+ "score": 0.5679592059634284,
199
+ "bcp_47": 2
200
+ },
201
  {
202
  "model": "meta-llama/llama-3.3-70b-instruct",
203
  "task": "classification",
 
226
  "score": 0.4836914110309717,
227
  "bcp_47": 10
228
  },
229
+ {
230
+ "model": "microsoft/phi-4-multimodal-instruct",
231
+ "task": "classification",
232
+ "metric": "accuracy",
233
+ "score": 0.4166666666666667,
234
+ "bcp_47": 2
235
+ },
236
+ {
237
+ "model": "microsoft/phi-4-multimodal-instruct",
238
+ "task": "language_modeling",
239
+ "metric": "chrf",
240
+ "score": 0.8700000415175042,
241
+ "bcp_47": 2
242
+ },
243
+ {
244
+ "model": "microsoft/phi-4-multimodal-instruct",
245
+ "task": "translation",
246
+ "metric": "bleu",
247
+ "score": 0.31733056990581465,
248
+ "bcp_47": 2
249
+ },
250
+ {
251
+ "model": "microsoft/phi-4-multimodal-instruct",
252
+ "task": "translation",
253
+ "metric": "chrf",
254
+ "score": 0.45631576469060464,
255
+ "bcp_47": 2
256
+ },
257
+ {
258
+ "model": "mistralai/mistral-nemo",
259
+ "task": "classification",
260
+ "metric": "accuracy",
261
+ "score": 0.5,
262
+ "bcp_47": 2
263
+ },
264
+ {
265
+ "model": "mistralai/mistral-nemo",
266
+ "task": "language_modeling",
267
+ "metric": "chrf",
268
+ "score": 0.8815544644693022,
269
+ "bcp_47": 2
270
+ },
271
+ {
272
+ "model": "mistralai/mistral-nemo",
273
+ "task": "translation",
274
+ "metric": "bleu",
275
+ "score": 0.3177444138044378,
276
+ "bcp_47": 2
277
+ },
278
+ {
279
+ "model": "mistralai/mistral-nemo",
280
+ "task": "translation",
281
+ "metric": "chrf",
282
+ "score": 0.49319228717306784,
283
+ "bcp_47": 2
284
+ },
285
  {
286
  "model": "mistralai/mistral-small-24b-instruct-2501",
287
  "task": "classification",
 
337
  "metric": "chrf",
338
  "score": 0.5452510379336759,
339
  "bcp_47": 2
340
+ },
341
+ {
342
+ "model": "qwen/qwq-32b",
343
+ "task": "classification",
344
+ "metric": "accuracy",
345
+ "score": 0.0,
346
+ "bcp_47": 2
347
+ },
348
+ {
349
+ "model": "qwen/qwq-32b",
350
+ "task": "language_modeling",
351
+ "metric": "chrf",
352
+ "score": 0.4813150156594517,
353
+ "bcp_47": 2
354
+ },
355
+ {
356
+ "model": "qwen/qwq-32b",
357
+ "task": "translation",
358
+ "metric": "bleu",
359
+ "score": 0.2144844735779058,
360
+ "bcp_47": 2
361
+ },
362
+ {
363
+ "model": "qwen/qwq-32b",
364
+ "task": "translation",
365
+ "metric": "chrf",
366
+ "score": 0.30433786997302065,
367
+ "bcp_47": 2
368
  }
369
  ],
370
  "languages": [
 
2750
  "in_benchmark": true,
2751
  "task": "classification",
2752
  "metric": "accuracy",
2753
+ "score": 0.5777777777777778,
2754
+ "model": 12.0
2755
  },
2756
  {
2757
  "bcp_47": "en",
 
2765
  "in_benchmark": true,
2766
  "task": "language_modeling",
2767
  "metric": "chrf",
2768
+ "score": 0.9222343234934963,
2769
+ "model": 12.0
2770
  },
2771
  {
2772
  "bcp_47": "en",
 
2780
  "in_benchmark": true,
2781
  "task": "translation",
2782
  "metric": "bleu",
2783
+ "score": 0.37035746903842287,
2784
+ "model": 12.0
2785
  },
2786
  {
2787
  "bcp_47": "en",
 
2795
  "in_benchmark": true,
2796
  "task": "translation",
2797
  "metric": "chrf",
2798
+ "score": 0.4880916692700535,
2799
+ "model": 12.0
2800
  },
2801
  {
2802
  "bcp_47": "eo",
 
10895
  "in_benchmark": true,
10896
  "task": "classification",
10897
  "metric": "accuracy",
10898
+ "score": 0.5499999999999999,
10899
+ "model": 12.0
10900
  },
10901
  {
10902
  "bcp_47": "zh",
 
10910
  "in_benchmark": true,
10911
  "task": "language_modeling",
10912
  "metric": "chrf",
10913
+ "score": 0.8599948525016986,
10914
+ "model": 12.0
10915
  },
10916
  {
10917
  "bcp_47": "zh",
 
10925
  "in_benchmark": true,
10926
  "task": "translation",
10927
  "metric": "bleu",
10928
+ "score": 0.3532292543512247,
10929
+ "model": 12.0
10930
  },
10931
  {
10932
  "bcp_47": "zh",
 
10940
  "in_benchmark": true,
10941
  "task": "translation",
10942
  "metric": "chrf",
10943
+ "score": 0.529398790799104,
10944
+ "model": 12.0
10945
  },
10946
  {
10947
  "bcp_47": "zmi",
 
10990
  }
10991
  ],
10992
  "scores": [
10993
+ {
10994
+ "model": "amazon/nova-micro-v1",
10995
+ "bcp_47": "en",
10996
+ "task": "classification",
10997
+ "metric": "accuracy",
10998
+ "score": 0.5333333333333333,
10999
+ "sentence_nr": 14.5
11000
+ },
11001
+ {
11002
+ "model": "amazon/nova-micro-v1",
11003
+ "bcp_47": "en",
11004
+ "task": "language_modeling",
11005
+ "metric": "chrf",
11006
+ "score": 0.9725001956658679,
11007
+ "sentence_nr": 14.5
11008
+ },
11009
+ {
11010
+ "model": "amazon/nova-micro-v1",
11011
+ "bcp_47": "en",
11012
+ "task": "translation",
11013
+ "metric": "bleu",
11014
+ "score": 0.4491277841667736,
11015
+ "sentence_nr": 14.5
11016
+ },
11017
+ {
11018
+ "model": "amazon/nova-micro-v1",
11019
+ "bcp_47": "en",
11020
+ "task": "translation",
11021
+ "metric": "chrf",
11022
+ "score": 0.5740458676508566,
11023
+ "sentence_nr": 14.5
11024
+ },
11025
+ {
11026
+ "model": "amazon/nova-micro-v1",
11027
+ "bcp_47": "zh",
11028
+ "task": "classification",
11029
+ "metric": "accuracy",
11030
+ "score": 0.5,
11031
+ "sentence_nr": 14.5
11032
+ },
11033
+ {
11034
+ "model": "amazon/nova-micro-v1",
11035
+ "bcp_47": "zh",
11036
+ "task": "language_modeling",
11037
+ "metric": "chrf",
11038
+ "score": 0.9167395508743035,
11039
+ "sentence_nr": 14.5
11040
+ },
11041
+ {
11042
+ "model": "amazon/nova-micro-v1",
11043
+ "bcp_47": "zh",
11044
+ "task": "translation",
11045
+ "metric": "bleu",
11046
+ "score": 0.3517140864634192,
11047
+ "sentence_nr": 14.5
11048
+ },
11049
+ {
11050
+ "model": "amazon/nova-micro-v1",
11051
+ "bcp_47": "zh",
11052
+ "task": "translation",
11053
+ "metric": "chrf",
11054
+ "score": 0.5543825716892707,
11055
+ "sentence_nr": 14.5
11056
+ },
11057
  {
11058
  "model": "google/gemini-2.0-flash-001",
11059
  "bcp_47": "en",
 
11118
  "score": 0.5606266861920302,
11119
  "sentence_nr": 14.5
11120
  },
11121
+ {
11122
+ "model": "google/gemini-2.0-flash-lite-001",
11123
+ "bcp_47": "en",
11124
+ "task": "classification",
11125
+ "metric": "accuracy",
11126
+ "score": 0.7333333333333333,
11127
+ "sentence_nr": 14.5
11128
+ },
11129
+ {
11130
+ "model": "google/gemini-2.0-flash-lite-001",
11131
+ "bcp_47": "en",
11132
+ "task": "language_modeling",
11133
+ "metric": "chrf",
11134
+ "score": 0.990925430282282,
11135
+ "sentence_nr": 14.5
11136
+ },
11137
+ {
11138
+ "model": "google/gemini-2.0-flash-lite-001",
11139
+ "bcp_47": "en",
11140
+ "task": "translation",
11141
+ "metric": "bleu",
11142
+ "score": 0.37911136698810943,
11143
+ "sentence_nr": 14.5
11144
+ },
11145
+ {
11146
+ "model": "google/gemini-2.0-flash-lite-001",
11147
+ "bcp_47": "en",
11148
+ "task": "translation",
11149
+ "metric": "chrf",
11150
+ "score": 0.5094402087357145,
11151
+ "sentence_nr": 14.5
11152
+ },
11153
+ {
11154
+ "model": "google/gemini-2.0-flash-lite-001",
11155
+ "bcp_47": "zh",
11156
+ "task": "classification",
11157
+ "metric": "accuracy",
11158
+ "score": 0.7333333333333333,
11159
+ "sentence_nr": 14.5
11160
+ },
11161
+ {
11162
+ "model": "google/gemini-2.0-flash-lite-001",
11163
+ "bcp_47": "zh",
11164
+ "task": "language_modeling",
11165
+ "metric": "chrf",
11166
+ "score": 0.9511134398957932,
11167
+ "sentence_nr": 14.5
11168
+ },
11169
+ {
11170
+ "model": "google/gemini-2.0-flash-lite-001",
11171
+ "bcp_47": "zh",
11172
+ "task": "translation",
11173
+ "metric": "bleu",
11174
+ "score": 0.4225918163141283,
11175
+ "sentence_nr": 14.5
11176
+ },
11177
+ {
11178
+ "model": "google/gemini-2.0-flash-lite-001",
11179
+ "bcp_47": "zh",
11180
+ "task": "translation",
11181
+ "metric": "chrf",
11182
+ "score": 0.5751241490536672,
11183
+ "sentence_nr": 14.5
11184
+ },
11185
  {
11186
  "model": "google/gemma-3-27b-it",
11187
  "bcp_47": "en",
 
11246
  "score": 0.520771580386218,
11247
  "sentence_nr": 14.5
11248
  },
11249
+ {
11250
+ "model": "meta-llama/llama-3-70b-instruct",
11251
+ "bcp_47": "en",
11252
+ "task": "classification",
11253
+ "metric": "accuracy",
11254
+ "score": 0.8333333333333334,
11255
+ "sentence_nr": 14.5
11256
+ },
11257
+ {
11258
+ "model": "meta-llama/llama-3-70b-instruct",
11259
+ "bcp_47": "en",
11260
+ "task": "language_modeling",
11261
+ "metric": "chrf",
11262
+ "score": 0.9674315682816375,
11263
+ "sentence_nr": 14.5
11264
+ },
11265
+ {
11266
+ "model": "meta-llama/llama-3-70b-instruct",
11267
+ "bcp_47": "en",
11268
+ "task": "translation",
11269
+ "metric": "bleu",
11270
+ "score": 0.18722412351358647,
11271
+ "sentence_nr": 14.5
11272
+ },
11273
+ {
11274
+ "model": "meta-llama/llama-3-70b-instruct",
11275
+ "bcp_47": "en",
11276
+ "task": "translation",
11277
+ "metric": "chrf",
11278
+ "score": 0.34151371128305424,
11279
+ "sentence_nr": 14.5
11280
+ },
11281
+ {
11282
+ "model": "meta-llama/llama-3-70b-instruct",
11283
+ "bcp_47": "zh",
11284
+ "task": "classification",
11285
+ "metric": "accuracy",
11286
+ "score": 0.7666666666666667,
11287
+ "sentence_nr": 14.5
11288
+ },
11289
+ {
11290
+ "model": "meta-llama/llama-3-70b-instruct",
11291
+ "bcp_47": "zh",
11292
+ "task": "language_modeling",
11293
+ "metric": "chrf",
11294
+ "score": 0.9230555490695652,
11295
+ "sentence_nr": 14.5
11296
+ },
11297
+ {
11298
+ "model": "meta-llama/llama-3-70b-instruct",
11299
+ "bcp_47": "zh",
11300
+ "task": "translation",
11301
+ "metric": "bleu",
11302
+ "score": 0.3157439141709964,
11303
+ "sentence_nr": 14.5
11304
+ },
11305
+ {
11306
+ "model": "meta-llama/llama-3-70b-instruct",
11307
+ "bcp_47": "zh",
11308
+ "task": "translation",
11309
+ "metric": "chrf",
11310
+ "score": 0.5156364087365835,
11311
+ "sentence_nr": 14.5
11312
+ },
11313
+ {
11314
+ "model": "meta-llama/llama-3.1-70b-instruct",
11315
+ "bcp_47": "en",
11316
+ "task": "classification",
11317
+ "metric": "accuracy",
11318
+ "score": 0.7,
11319
+ "sentence_nr": 14.5
11320
+ },
11321
+ {
11322
+ "model": "meta-llama/llama-3.1-70b-instruct",
11323
+ "bcp_47": "en",
11324
+ "task": "language_modeling",
11325
+ "metric": "chrf",
11326
+ "score": 0.9701295103188484,
11327
+ "sentence_nr": 14.5
11328
+ },
11329
+ {
11330
+ "model": "meta-llama/llama-3.1-70b-instruct",
11331
+ "bcp_47": "en",
11332
+ "task": "translation",
11333
+ "metric": "bleu",
11334
+ "score": 0.44443705644214526,
11335
+ "sentence_nr": 14.5
11336
+ },
11337
+ {
11338
+ "model": "meta-llama/llama-3.1-70b-instruct",
11339
+ "bcp_47": "en",
11340
+ "task": "translation",
11341
+ "metric": "chrf",
11342
+ "score": 0.5485685299214524,
11343
+ "sentence_nr": 14.5
11344
+ },
11345
+ {
11346
+ "model": "meta-llama/llama-3.1-70b-instruct",
11347
+ "bcp_47": "zh",
11348
+ "task": "classification",
11349
+ "metric": "accuracy",
11350
+ "score": 0.43333333333333335,
11351
+ "sentence_nr": 14.5
11352
+ },
11353
+ {
11354
+ "model": "meta-llama/llama-3.1-70b-instruct",
11355
+ "bcp_47": "zh",
11356
+ "task": "language_modeling",
11357
+ "metric": "chrf",
11358
+ "score": 0.8705635265954298,
11359
+ "sentence_nr": 14.5
11360
+ },
11361
+ {
11362
+ "model": "meta-llama/llama-3.1-70b-instruct",
11363
+ "bcp_47": "zh",
11364
+ "task": "translation",
11365
+ "metric": "bleu",
11366
+ "score": 0.4192089568216648,
11367
+ "sentence_nr": 14.5
11368
+ },
11369
+ {
11370
+ "model": "meta-llama/llama-3.1-70b-instruct",
11371
+ "bcp_47": "zh",
11372
+ "task": "translation",
11373
+ "metric": "chrf",
11374
+ "score": 0.5873498820054043,
11375
+ "sentence_nr": 14.5
11376
+ },
11377
  {
11378
  "model": "meta-llama/llama-3.3-70b-instruct",
11379
  "bcp_47": "ar",
 
11694
  "score": 0.5862284100611604,
11695
  "sentence_nr": 14.5
11696
  },
11697
+ {
11698
+ "model": "microsoft/phi-4-multimodal-instruct",
11699
+ "bcp_47": "en",
11700
+ "task": "classification",
11701
+ "metric": "accuracy",
11702
+ "score": 0.43333333333333335,
11703
+ "sentence_nr": 14.5
11704
+ },
11705
+ {
11706
+ "model": "microsoft/phi-4-multimodal-instruct",
11707
+ "bcp_47": "en",
11708
+ "task": "language_modeling",
11709
+ "metric": "chrf",
11710
+ "score": 0.9268050965065061,
11711
+ "sentence_nr": 14.5
11712
+ },
11713
+ {
11714
+ "model": "microsoft/phi-4-multimodal-instruct",
11715
+ "bcp_47": "en",
11716
+ "task": "translation",
11717
+ "metric": "bleu",
11718
+ "score": 0.34049537977839345,
11719
+ "sentence_nr": 14.5
11720
+ },
11721
+ {
11722
+ "model": "microsoft/phi-4-multimodal-instruct",
11723
+ "bcp_47": "en",
11724
+ "task": "translation",
11725
+ "metric": "chrf",
11726
+ "score": 0.4566714452688056,
11727
+ "sentence_nr": 14.5
11728
+ },
11729
+ {
11730
+ "model": "microsoft/phi-4-multimodal-instruct",
11731
+ "bcp_47": "zh",
11732
+ "task": "classification",
11733
+ "metric": "accuracy",
11734
+ "score": 0.4,
11735
+ "sentence_nr": 14.5
11736
+ },
11737
+ {
11738
+ "model": "microsoft/phi-4-multimodal-instruct",
11739
+ "bcp_47": "zh",
11740
+ "task": "language_modeling",
11741
+ "metric": "chrf",
11742
+ "score": 0.8131949865285024,
11743
+ "sentence_nr": 14.5
11744
+ },
11745
+ {
11746
+ "model": "microsoft/phi-4-multimodal-instruct",
11747
+ "bcp_47": "zh",
11748
+ "task": "translation",
11749
+ "metric": "bleu",
11750
+ "score": 0.2941657600332359,
11751
+ "sentence_nr": 14.5
11752
+ },
11753
+ {
11754
+ "model": "microsoft/phi-4-multimodal-instruct",
11755
+ "bcp_47": "zh",
11756
+ "task": "translation",
11757
+ "metric": "chrf",
11758
+ "score": 0.4559600841124037,
11759
+ "sentence_nr": 14.5
11760
+ },
11761
+ {
11762
+ "model": "mistralai/mistral-nemo",
11763
+ "bcp_47": "en",
11764
+ "task": "classification",
11765
+ "metric": "accuracy",
11766
+ "score": 0.4666666666666667,
11767
+ "sentence_nr": 14.5
11768
+ },
11769
+ {
11770
+ "model": "mistralai/mistral-nemo",
11771
+ "bcp_47": "en",
11772
+ "task": "language_modeling",
11773
+ "metric": "chrf",
11774
+ "score": 0.9383955895073849,
11775
+ "sentence_nr": 14.5
11776
+ },
11777
+ {
11778
+ "model": "mistralai/mistral-nemo",
11779
+ "bcp_47": "en",
11780
+ "task": "translation",
11781
+ "metric": "bleu",
11782
+ "score": 0.3057719571177098,
11783
+ "sentence_nr": 14.5
11784
+ },
11785
+ {
11786
+ "model": "mistralai/mistral-nemo",
11787
+ "bcp_47": "en",
11788
+ "task": "translation",
11789
+ "metric": "chrf",
11790
+ "score": 0.45969934521843914,
11791
+ "sentence_nr": 14.5
11792
+ },
11793
+ {
11794
+ "model": "mistralai/mistral-nemo",
11795
+ "bcp_47": "zh",
11796
+ "task": "classification",
11797
+ "metric": "accuracy",
11798
+ "score": 0.5333333333333333,
11799
+ "sentence_nr": 14.5
11800
+ },
11801
+ {
11802
+ "model": "mistralai/mistral-nemo",
11803
+ "bcp_47": "zh",
11804
+ "task": "language_modeling",
11805
+ "metric": "chrf",
11806
+ "score": 0.8247133394312195,
11807
+ "sentence_nr": 14.5
11808
+ },
11809
+ {
11810
+ "model": "mistralai/mistral-nemo",
11811
+ "bcp_47": "zh",
11812
+ "task": "translation",
11813
+ "metric": "bleu",
11814
+ "score": 0.32971687049116577,
11815
+ "sentence_nr": 14.5
11816
+ },
11817
+ {
11818
+ "model": "mistralai/mistral-nemo",
11819
+ "bcp_47": "zh",
11820
+ "task": "translation",
11821
+ "metric": "chrf",
11822
+ "score": 0.5266852291276966,
11823
+ "sentence_nr": 14.5
11824
+ },
11825
  {
11826
  "model": "mistralai/mistral-small-24b-instruct-2501",
11827
  "bcp_47": "en",
 
11949
  "metric": "chrf",
11950
  "score": 0.559410465345808,
11951
  "sentence_nr": 14.5
11952
+ },
11953
+ {
11954
+ "model": "qwen/qwq-32b",
11955
+ "bcp_47": "en",
11956
+ "task": "classification",
11957
+ "metric": "accuracy",
11958
+ "score": 0.0,
11959
+ "sentence_nr": 14.5
11960
+ },
11961
+ {
11962
+ "model": "qwen/qwq-32b",
11963
+ "bcp_47": "en",
11964
+ "task": "language_modeling",
11965
+ "metric": "chrf",
11966
+ "score": 0.6047457400839834,
11967
+ "sentence_nr": 14.5
11968
+ },
11969
+ {
11970
+ "model": "qwen/qwq-32b",
11971
+ "bcp_47": "en",
11972
+ "task": "translation",
11973
+ "metric": "bleu",
11974
+ "score": 0.20068036705764214,
11975
+ "sentence_nr": 14.5
11976
+ },
11977
+ {
11978
+ "model": "qwen/qwq-32b",
11979
+ "bcp_47": "en",
11980
+ "task": "translation",
11981
+ "metric": "chrf",
11982
+ "score": 0.23884729813422853,
11983
+ "sentence_nr": 14.5
11984
+ },
11985
+ {
11986
+ "model": "qwen/qwq-32b",
11987
+ "bcp_47": "zh",
11988
+ "task": "classification",
11989
+ "metric": "accuracy",
11990
+ "score": 0.0,
11991
+ "sentence_nr": 14.5
11992
+ },
11993
+ {
11994
+ "model": "qwen/qwq-32b",
11995
+ "bcp_47": "zh",
11996
+ "task": "language_modeling",
11997
+ "metric": "chrf",
11998
+ "score": 0.35788429123492,
11999
+ "sentence_nr": 14.5
12000
+ },
12001
+ {
12002
+ "model": "qwen/qwq-32b",
12003
+ "bcp_47": "zh",
12004
+ "task": "translation",
12005
+ "metric": "bleu",
12006
+ "score": 0.22828858009816946,
12007
+ "sentence_nr": 14.5
12008
+ },
12009
+ {
12010
+ "model": "qwen/qwq-32b",
12011
+ "bcp_47": "zh",
12012
+ "task": "translation",
12013
+ "metric": "chrf",
12014
+ "score": 0.3698284418118128,
12015
+ "sentence_nr": 14.5
12016
  }
12017
  ]
12018
  }