David Pomerenke
commited on
Commit
·
c5278dd
1
Parent(s):
d1a7111
More models
Browse files- evals/models.py +9 -2
- frontend/public/results.json +125 -20
- results.json +668 -24
evals/models.py
CHANGED
@@ -15,12 +15,19 @@ models = [
|
|
15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
|
|
|
|
18 |
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
|
|
19 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
|
|
|
|
20 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
21 |
-
|
|
|
22 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
23 |
-
"
|
|
|
24 |
]
|
25 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
26 |
|
|
|
15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
18 |
+
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
19 |
+
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
20 |
"mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
|
21 |
+
"mistralai/mistral-nemo",
|
22 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
23 |
+
"google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
24 |
+
"google/gemma-3-27b-it", # 0.2$/M tokens
|
25 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
26 |
+
"qwen/qwq-32b",
|
27 |
+
# "deepseek/deepseek-chat", # 1.3$/M tokens
|
28 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
29 |
+
"microsoft/phi-4-multimodal-instruct",
|
30 |
+
"amazon/nova-micro-v1", # 0.09$/M tokens
|
31 |
]
|
32 |
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
33 |
|
frontend/public/results.json
CHANGED
@@ -18,6 +18,21 @@
|
|
18 |
{
|
19 |
"rank": 2,
|
20 |
"provider": "Google",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
"model": "Gemma 3 27b It",
|
22 |
"hf_id": "google/gemma-3-27b-it",
|
23 |
"creation_date": "2025-03-01",
|
@@ -31,7 +46,52 @@
|
|
31 |
"translation_chrf": 0.54
|
32 |
},
|
33 |
{
|
34 |
-
"rank":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
"provider": "OpenAI",
|
36 |
"model": "GPT 4o Mini",
|
37 |
"hf_id": null,
|
@@ -46,7 +106,7 @@
|
|
46 |
"translation_chrf": 0.55
|
47 |
},
|
48 |
{
|
49 |
-
"rank":
|
50 |
"provider": "MistralAI",
|
51 |
"model": "Mistral Small 24b Instruct 2501",
|
52 |
"hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
|
@@ -61,7 +121,7 @@
|
|
61 |
"translation_chrf": 0.52
|
62 |
},
|
63 |
{
|
64 |
-
"rank":
|
65 |
"provider": "Meta Llama",
|
66 |
"model": "Llama 3.3 70b Instruct",
|
67 |
"hf_id": "meta-llama/Llama-3.3-70B-Instruct",
|
@@ -74,6 +134,51 @@
|
|
74 |
"language_modeling_chrf": 0.94,
|
75 |
"translation_bleu": 0.31,
|
76 |
"translation_chrf": 0.48
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
}
|
78 |
],
|
79 |
"language_table": [
|
@@ -81,25 +186,13 @@
|
|
81 |
"language_name": "English",
|
82 |
"speakers": 1636485840,
|
83 |
"family": "Indo-European",
|
84 |
-
"average": 0.
|
85 |
"in_benchmark": true,
|
86 |
"NaN": 0.0,
|
87 |
-
"classification_accuracy": 0.
|
88 |
-
"language_modeling_chrf": 0.
|
89 |
-
"translation_bleu": 0.
|
90 |
-
"translation_chrf": 0.
|
91 |
-
},
|
92 |
-
{
|
93 |
-
"language_name": "Chinese",
|
94 |
-
"speakers": 1304678914,
|
95 |
-
"family": "Sino-Tibetan",
|
96 |
-
"average": 0.5,
|
97 |
-
"in_benchmark": true,
|
98 |
-
"NaN": 0.0,
|
99 |
-
"classification_accuracy": 0.65,
|
100 |
-
"language_modeling_chrf": 0.93,
|
101 |
-
"translation_bleu": 0.38,
|
102 |
-
"translation_chrf": 0.55
|
103 |
},
|
104 |
{
|
105 |
"language_name": "French",
|
@@ -113,6 +206,18 @@
|
|
113 |
"translation_bleu": 0.32,
|
114 |
"translation_chrf": 0.49
|
115 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
{
|
117 |
"language_name": "Hindi",
|
118 |
"speakers": 546882144,
|
|
|
18 |
{
|
19 |
"rank": 2,
|
20 |
"provider": "Google",
|
21 |
+
"model": "Gemini 2.0 Flash Lite 001",
|
22 |
+
"hf_id": null,
|
23 |
+
"creation_date": null,
|
24 |
+
"size": null,
|
25 |
+
"type": "Commercial",
|
26 |
+
"license": null,
|
27 |
+
"average": 0.66,
|
28 |
+
"classification_accuracy": 0.73,
|
29 |
+
"language_modeling_chrf": 0.97,
|
30 |
+
"translation_bleu": 0.4,
|
31 |
+
"translation_chrf": 0.54
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"rank": 3,
|
35 |
+
"provider": "Google",
|
36 |
"model": "Gemma 3 27b It",
|
37 |
"hf_id": "google/gemma-3-27b-it",
|
38 |
"creation_date": "2025-03-01",
|
|
|
46 |
"translation_chrf": 0.54
|
47 |
},
|
48 |
{
|
49 |
+
"rank": 4,
|
50 |
+
"provider": "Meta Llama",
|
51 |
+
"model": "Llama 3.1 70b Instruct",
|
52 |
+
"hf_id": "meta-llama/Llama-3.1-70B-Instruct",
|
53 |
+
"creation_date": "2024-07-16",
|
54 |
+
"size": 70553706496.0,
|
55 |
+
"type": "Open",
|
56 |
+
"license": "Llama3.1",
|
57 |
+
"average": 0.62,
|
58 |
+
"classification_accuracy": 0.57,
|
59 |
+
"language_modeling_chrf": 0.92,
|
60 |
+
"translation_bleu": 0.43,
|
61 |
+
"translation_chrf": 0.57
|
62 |
+
},
|
63 |
+
{
|
64 |
+
"rank": 5,
|
65 |
+
"provider": "Amazon",
|
66 |
+
"model": "Nova Micro V1",
|
67 |
+
"hf_id": null,
|
68 |
+
"creation_date": null,
|
69 |
+
"size": null,
|
70 |
+
"type": "Commercial",
|
71 |
+
"license": null,
|
72 |
+
"average": 0.61,
|
73 |
+
"classification_accuracy": 0.52,
|
74 |
+
"language_modeling_chrf": 0.94,
|
75 |
+
"translation_bleu": 0.4,
|
76 |
+
"translation_chrf": 0.56
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"rank": 6,
|
80 |
+
"provider": "Meta Llama",
|
81 |
+
"model": "Llama 3 70b Instruct",
|
82 |
+
"hf_id": null,
|
83 |
+
"creation_date": null,
|
84 |
+
"size": null,
|
85 |
+
"type": "Commercial",
|
86 |
+
"license": null,
|
87 |
+
"average": 0.61,
|
88 |
+
"classification_accuracy": 0.8,
|
89 |
+
"language_modeling_chrf": 0.95,
|
90 |
+
"translation_bleu": 0.25,
|
91 |
+
"translation_chrf": 0.43
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"rank": 7,
|
95 |
"provider": "OpenAI",
|
96 |
"model": "GPT 4o Mini",
|
97 |
"hf_id": null,
|
|
|
106 |
"translation_chrf": 0.55
|
107 |
},
|
108 |
{
|
109 |
+
"rank": 8,
|
110 |
"provider": "MistralAI",
|
111 |
"model": "Mistral Small 24b Instruct 2501",
|
112 |
"hf_id": "mistralai/Mistral-Small-24B-Instruct-2501",
|
|
|
121 |
"translation_chrf": 0.52
|
122 |
},
|
123 |
{
|
124 |
+
"rank": 9,
|
125 |
"provider": "Meta Llama",
|
126 |
"model": "Llama 3.3 70b Instruct",
|
127 |
"hf_id": "meta-llama/Llama-3.3-70B-Instruct",
|
|
|
134 |
"language_modeling_chrf": 0.94,
|
135 |
"translation_bleu": 0.31,
|
136 |
"translation_chrf": 0.48
|
137 |
+
},
|
138 |
+
{
|
139 |
+
"rank": 10,
|
140 |
+
"provider": "MistralAI",
|
141 |
+
"model": "Mistral Nemo",
|
142 |
+
"hf_id": null,
|
143 |
+
"creation_date": null,
|
144 |
+
"size": null,
|
145 |
+
"type": "Commercial",
|
146 |
+
"license": null,
|
147 |
+
"average": 0.55,
|
148 |
+
"classification_accuracy": 0.5,
|
149 |
+
"language_modeling_chrf": 0.88,
|
150 |
+
"translation_bleu": 0.32,
|
151 |
+
"translation_chrf": 0.49
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"rank": 11,
|
155 |
+
"provider": "Microsoft",
|
156 |
+
"model": "Phi 4 Multimodal Instruct",
|
157 |
+
"hf_id": "microsoft/Phi-4-multimodal-instruct",
|
158 |
+
"creation_date": "2025-02-24",
|
159 |
+
"size": 5574460384.0,
|
160 |
+
"type": "Open",
|
161 |
+
"license": "Mit",
|
162 |
+
"average": 0.52,
|
163 |
+
"classification_accuracy": 0.42,
|
164 |
+
"language_modeling_chrf": 0.87,
|
165 |
+
"translation_bleu": 0.32,
|
166 |
+
"translation_chrf": 0.46
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"rank": 12,
|
170 |
+
"provider": "Qwen",
|
171 |
+
"model": "Qwq 32b",
|
172 |
+
"hf_id": "Qwen/QwQ-32B",
|
173 |
+
"creation_date": "2025-03-05",
|
174 |
+
"size": 32763876352.0,
|
175 |
+
"type": "Open",
|
176 |
+
"license": "Apache 2.0",
|
177 |
+
"average": 0.25,
|
178 |
+
"classification_accuracy": 0.0,
|
179 |
+
"language_modeling_chrf": 0.48,
|
180 |
+
"translation_bleu": 0.21,
|
181 |
+
"translation_chrf": 0.3
|
182 |
}
|
183 |
],
|
184 |
"language_table": [
|
|
|
186 |
"language_name": "English",
|
187 |
"speakers": 1636485840,
|
188 |
"family": "Indo-European",
|
189 |
+
"average": 0.47,
|
190 |
"in_benchmark": true,
|
191 |
"NaN": 0.0,
|
192 |
+
"classification_accuracy": 0.58,
|
193 |
+
"language_modeling_chrf": 0.92,
|
194 |
+
"translation_bleu": 0.37,
|
195 |
+
"translation_chrf": 0.49
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
},
|
197 |
{
|
198 |
"language_name": "French",
|
|
|
206 |
"translation_bleu": 0.32,
|
207 |
"translation_chrf": 0.49
|
208 |
},
|
209 |
+
{
|
210 |
+
"language_name": "Chinese",
|
211 |
+
"speakers": 1304678914,
|
212 |
+
"family": "Sino-Tibetan",
|
213 |
+
"average": 0.46,
|
214 |
+
"in_benchmark": true,
|
215 |
+
"NaN": 0.0,
|
216 |
+
"classification_accuracy": 0.55,
|
217 |
+
"language_modeling_chrf": 0.86,
|
218 |
+
"translation_bleu": 0.35,
|
219 |
+
"translation_chrf": 0.53
|
220 |
+
},
|
221 |
{
|
222 |
"language_name": "Hindi",
|
223 |
"speakers": 546882144,
|
results.json
CHANGED
@@ -3,33 +3,61 @@
|
|
3 |
{
|
4 |
"task": "classification",
|
5 |
"metric": "accuracy",
|
6 |
-
"score": 0.
|
7 |
"bcp_47": 10,
|
8 |
-
"model":
|
9 |
},
|
10 |
{
|
11 |
"task": "language_modeling",
|
12 |
"metric": "chrf",
|
13 |
-
"score": 0.
|
14 |
"bcp_47": 10,
|
15 |
-
"model":
|
16 |
},
|
17 |
{
|
18 |
"task": "translation",
|
19 |
"metric": "bleu",
|
20 |
-
"score": 0.
|
21 |
"bcp_47": 10,
|
22 |
-
"model":
|
23 |
},
|
24 |
{
|
25 |
"task": "translation",
|
26 |
"metric": "chrf",
|
27 |
-
"score": 0.
|
28 |
"bcp_47": 10,
|
29 |
-
"model":
|
30 |
}
|
31 |
],
|
32 |
"models": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
{
|
34 |
"model": "google/gemini-2.0-flash-001",
|
35 |
"task": "classification",
|
@@ -58,6 +86,34 @@
|
|
58 |
"score": 0.5828490054615683,
|
59 |
"bcp_47": 2
|
60 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
{
|
62 |
"model": "google/gemma-3-27b-it",
|
63 |
"task": "classification",
|
@@ -86,6 +142,62 @@
|
|
86 |
"score": 0.5376336154503363,
|
87 |
"bcp_47": 2
|
88 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
{
|
90 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
91 |
"task": "classification",
|
@@ -114,6 +226,62 @@
|
|
114 |
"score": 0.4836914110309717,
|
115 |
"bcp_47": 10
|
116 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
{
|
118 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
119 |
"task": "classification",
|
@@ -169,6 +337,34 @@
|
|
169 |
"metric": "chrf",
|
170 |
"score": 0.5452510379336759,
|
171 |
"bcp_47": 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
}
|
173 |
],
|
174 |
"languages": [
|
@@ -2554,8 +2750,8 @@
|
|
2554 |
"in_benchmark": true,
|
2555 |
"task": "classification",
|
2556 |
"metric": "accuracy",
|
2557 |
-
"score": 0.
|
2558 |
-
"model":
|
2559 |
},
|
2560 |
{
|
2561 |
"bcp_47": "en",
|
@@ -2569,8 +2765,8 @@
|
|
2569 |
"in_benchmark": true,
|
2570 |
"task": "language_modeling",
|
2571 |
"metric": "chrf",
|
2572 |
-
"score": 0.
|
2573 |
-
"model":
|
2574 |
},
|
2575 |
{
|
2576 |
"bcp_47": "en",
|
@@ -2584,8 +2780,8 @@
|
|
2584 |
"in_benchmark": true,
|
2585 |
"task": "translation",
|
2586 |
"metric": "bleu",
|
2587 |
-
"score": 0.
|
2588 |
-
"model":
|
2589 |
},
|
2590 |
{
|
2591 |
"bcp_47": "en",
|
@@ -2599,8 +2795,8 @@
|
|
2599 |
"in_benchmark": true,
|
2600 |
"task": "translation",
|
2601 |
"metric": "chrf",
|
2602 |
-
"score": 0.
|
2603 |
-
"model":
|
2604 |
},
|
2605 |
{
|
2606 |
"bcp_47": "eo",
|
@@ -10699,8 +10895,8 @@
|
|
10699 |
"in_benchmark": true,
|
10700 |
"task": "classification",
|
10701 |
"metric": "accuracy",
|
10702 |
-
"score": 0.
|
10703 |
-
"model":
|
10704 |
},
|
10705 |
{
|
10706 |
"bcp_47": "zh",
|
@@ -10714,8 +10910,8 @@
|
|
10714 |
"in_benchmark": true,
|
10715 |
"task": "language_modeling",
|
10716 |
"metric": "chrf",
|
10717 |
-
"score": 0.
|
10718 |
-
"model":
|
10719 |
},
|
10720 |
{
|
10721 |
"bcp_47": "zh",
|
@@ -10729,8 +10925,8 @@
|
|
10729 |
"in_benchmark": true,
|
10730 |
"task": "translation",
|
10731 |
"metric": "bleu",
|
10732 |
-
"score": 0.
|
10733 |
-
"model":
|
10734 |
},
|
10735 |
{
|
10736 |
"bcp_47": "zh",
|
@@ -10744,8 +10940,8 @@
|
|
10744 |
"in_benchmark": true,
|
10745 |
"task": "translation",
|
10746 |
"metric": "chrf",
|
10747 |
-
"score": 0.
|
10748 |
-
"model":
|
10749 |
},
|
10750 |
{
|
10751 |
"bcp_47": "zmi",
|
@@ -10794,6 +10990,70 @@
|
|
10794 |
}
|
10795 |
],
|
10796 |
"scores": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10797 |
{
|
10798 |
"model": "google/gemini-2.0-flash-001",
|
10799 |
"bcp_47": "en",
|
@@ -10858,6 +11118,70 @@
|
|
10858 |
"score": 0.5606266861920302,
|
10859 |
"sentence_nr": 14.5
|
10860 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10861 |
{
|
10862 |
"model": "google/gemma-3-27b-it",
|
10863 |
"bcp_47": "en",
|
@@ -10922,6 +11246,134 @@
|
|
10922 |
"score": 0.520771580386218,
|
10923 |
"sentence_nr": 14.5
|
10924 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10925 |
{
|
10926 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
10927 |
"bcp_47": "ar",
|
@@ -11242,6 +11694,134 @@
|
|
11242 |
"score": 0.5862284100611604,
|
11243 |
"sentence_nr": 14.5
|
11244 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11245 |
{
|
11246 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
11247 |
"bcp_47": "en",
|
@@ -11369,6 +11949,70 @@
|
|
11369 |
"metric": "chrf",
|
11370 |
"score": 0.559410465345808,
|
11371 |
"sentence_nr": 14.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11372 |
}
|
11373 |
]
|
11374 |
}
|
|
|
3 |
{
|
4 |
"task": "classification",
|
5 |
"metric": "accuracy",
|
6 |
+
"score": 0.5427083333333333,
|
7 |
"bcp_47": 10,
|
8 |
+
"model": 12
|
9 |
},
|
10 |
{
|
11 |
"task": "language_modeling",
|
12 |
"metric": "chrf",
|
13 |
+
"score": 0.9024222998985517,
|
14 |
"bcp_47": 10,
|
15 |
+
"model": 12
|
16 |
},
|
17 |
{
|
18 |
"task": "translation",
|
19 |
"metric": "bleu",
|
20 |
+
"score": 0.34325329881872996,
|
21 |
"bcp_47": 10,
|
22 |
+
"model": 12
|
23 |
},
|
24 |
{
|
25 |
"task": "translation",
|
26 |
"metric": "chrf",
|
27 |
+
"score": 0.49764810942023735,
|
28 |
"bcp_47": 10,
|
29 |
+
"model": 12
|
30 |
}
|
31 |
],
|
32 |
"models": [
|
33 |
+
{
|
34 |
+
"model": "amazon/nova-micro-v1",
|
35 |
+
"task": "classification",
|
36 |
+
"metric": "accuracy",
|
37 |
+
"score": 0.5166666666666666,
|
38 |
+
"bcp_47": 2
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"model": "amazon/nova-micro-v1",
|
42 |
+
"task": "language_modeling",
|
43 |
+
"metric": "chrf",
|
44 |
+
"score": 0.9446198732700857,
|
45 |
+
"bcp_47": 2
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"model": "amazon/nova-micro-v1",
|
49 |
+
"task": "translation",
|
50 |
+
"metric": "bleu",
|
51 |
+
"score": 0.40042093531509637,
|
52 |
+
"bcp_47": 2
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"model": "amazon/nova-micro-v1",
|
56 |
+
"task": "translation",
|
57 |
+
"metric": "chrf",
|
58 |
+
"score": 0.5642142196700637,
|
59 |
+
"bcp_47": 2
|
60 |
+
},
|
61 |
{
|
62 |
"model": "google/gemini-2.0-flash-001",
|
63 |
"task": "classification",
|
|
|
86 |
"score": 0.5828490054615683,
|
87 |
"bcp_47": 2
|
88 |
},
|
89 |
+
{
|
90 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
91 |
+
"task": "classification",
|
92 |
+
"metric": "accuracy",
|
93 |
+
"score": 0.7333333333333333,
|
94 |
+
"bcp_47": 2
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
98 |
+
"task": "language_modeling",
|
99 |
+
"metric": "chrf",
|
100 |
+
"score": 0.9710194350890375,
|
101 |
+
"bcp_47": 2
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
105 |
+
"task": "translation",
|
106 |
+
"metric": "bleu",
|
107 |
+
"score": 0.40085159165111883,
|
108 |
+
"bcp_47": 2
|
109 |
+
},
|
110 |
+
{
|
111 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
112 |
+
"task": "translation",
|
113 |
+
"metric": "chrf",
|
114 |
+
"score": 0.5422821788946908,
|
115 |
+
"bcp_47": 2
|
116 |
+
},
|
117 |
{
|
118 |
"model": "google/gemma-3-27b-it",
|
119 |
"task": "classification",
|
|
|
142 |
"score": 0.5376336154503363,
|
143 |
"bcp_47": 2
|
144 |
},
|
145 |
+
{
|
146 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
147 |
+
"task": "classification",
|
148 |
+
"metric": "accuracy",
|
149 |
+
"score": 0.8,
|
150 |
+
"bcp_47": 2
|
151 |
+
},
|
152 |
+
{
|
153 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
154 |
+
"task": "language_modeling",
|
155 |
+
"metric": "chrf",
|
156 |
+
"score": 0.9452435586756014,
|
157 |
+
"bcp_47": 2
|
158 |
+
},
|
159 |
+
{
|
160 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
161 |
+
"task": "translation",
|
162 |
+
"metric": "bleu",
|
163 |
+
"score": 0.25148401884229143,
|
164 |
+
"bcp_47": 2
|
165 |
+
},
|
166 |
+
{
|
167 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
168 |
+
"task": "translation",
|
169 |
+
"metric": "chrf",
|
170 |
+
"score": 0.4285750600098188,
|
171 |
+
"bcp_47": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
175 |
+
"task": "classification",
|
176 |
+
"metric": "accuracy",
|
177 |
+
"score": 0.5666666666666667,
|
178 |
+
"bcp_47": 2
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
182 |
+
"task": "language_modeling",
|
183 |
+
"metric": "chrf",
|
184 |
+
"score": 0.9203465184571391,
|
185 |
+
"bcp_47": 2
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
189 |
+
"task": "translation",
|
190 |
+
"metric": "bleu",
|
191 |
+
"score": 0.43182300663190504,
|
192 |
+
"bcp_47": 2
|
193 |
+
},
|
194 |
+
{
|
195 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
196 |
+
"task": "translation",
|
197 |
+
"metric": "chrf",
|
198 |
+
"score": 0.5679592059634284,
|
199 |
+
"bcp_47": 2
|
200 |
+
},
|
201 |
{
|
202 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
203 |
"task": "classification",
|
|
|
226 |
"score": 0.4836914110309717,
|
227 |
"bcp_47": 10
|
228 |
},
|
229 |
+
{
|
230 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
231 |
+
"task": "classification",
|
232 |
+
"metric": "accuracy",
|
233 |
+
"score": 0.4166666666666667,
|
234 |
+
"bcp_47": 2
|
235 |
+
},
|
236 |
+
{
|
237 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
238 |
+
"task": "language_modeling",
|
239 |
+
"metric": "chrf",
|
240 |
+
"score": 0.8700000415175042,
|
241 |
+
"bcp_47": 2
|
242 |
+
},
|
243 |
+
{
|
244 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
245 |
+
"task": "translation",
|
246 |
+
"metric": "bleu",
|
247 |
+
"score": 0.31733056990581465,
|
248 |
+
"bcp_47": 2
|
249 |
+
},
|
250 |
+
{
|
251 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
252 |
+
"task": "translation",
|
253 |
+
"metric": "chrf",
|
254 |
+
"score": 0.45631576469060464,
|
255 |
+
"bcp_47": 2
|
256 |
+
},
|
257 |
+
{
|
258 |
+
"model": "mistralai/mistral-nemo",
|
259 |
+
"task": "classification",
|
260 |
+
"metric": "accuracy",
|
261 |
+
"score": 0.5,
|
262 |
+
"bcp_47": 2
|
263 |
+
},
|
264 |
+
{
|
265 |
+
"model": "mistralai/mistral-nemo",
|
266 |
+
"task": "language_modeling",
|
267 |
+
"metric": "chrf",
|
268 |
+
"score": 0.8815544644693022,
|
269 |
+
"bcp_47": 2
|
270 |
+
},
|
271 |
+
{
|
272 |
+
"model": "mistralai/mistral-nemo",
|
273 |
+
"task": "translation",
|
274 |
+
"metric": "bleu",
|
275 |
+
"score": 0.3177444138044378,
|
276 |
+
"bcp_47": 2
|
277 |
+
},
|
278 |
+
{
|
279 |
+
"model": "mistralai/mistral-nemo",
|
280 |
+
"task": "translation",
|
281 |
+
"metric": "chrf",
|
282 |
+
"score": 0.49319228717306784,
|
283 |
+
"bcp_47": 2
|
284 |
+
},
|
285 |
{
|
286 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
287 |
"task": "classification",
|
|
|
337 |
"metric": "chrf",
|
338 |
"score": 0.5452510379336759,
|
339 |
"bcp_47": 2
|
340 |
+
},
|
341 |
+
{
|
342 |
+
"model": "qwen/qwq-32b",
|
343 |
+
"task": "classification",
|
344 |
+
"metric": "accuracy",
|
345 |
+
"score": 0.0,
|
346 |
+
"bcp_47": 2
|
347 |
+
},
|
348 |
+
{
|
349 |
+
"model": "qwen/qwq-32b",
|
350 |
+
"task": "language_modeling",
|
351 |
+
"metric": "chrf",
|
352 |
+
"score": 0.4813150156594517,
|
353 |
+
"bcp_47": 2
|
354 |
+
},
|
355 |
+
{
|
356 |
+
"model": "qwen/qwq-32b",
|
357 |
+
"task": "translation",
|
358 |
+
"metric": "bleu",
|
359 |
+
"score": 0.2144844735779058,
|
360 |
+
"bcp_47": 2
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"model": "qwen/qwq-32b",
|
364 |
+
"task": "translation",
|
365 |
+
"metric": "chrf",
|
366 |
+
"score": 0.30433786997302065,
|
367 |
+
"bcp_47": 2
|
368 |
}
|
369 |
],
|
370 |
"languages": [
|
|
|
2750 |
"in_benchmark": true,
|
2751 |
"task": "classification",
|
2752 |
"metric": "accuracy",
|
2753 |
+
"score": 0.5777777777777778,
|
2754 |
+
"model": 12.0
|
2755 |
},
|
2756 |
{
|
2757 |
"bcp_47": "en",
|
|
|
2765 |
"in_benchmark": true,
|
2766 |
"task": "language_modeling",
|
2767 |
"metric": "chrf",
|
2768 |
+
"score": 0.9222343234934963,
|
2769 |
+
"model": 12.0
|
2770 |
},
|
2771 |
{
|
2772 |
"bcp_47": "en",
|
|
|
2780 |
"in_benchmark": true,
|
2781 |
"task": "translation",
|
2782 |
"metric": "bleu",
|
2783 |
+
"score": 0.37035746903842287,
|
2784 |
+
"model": 12.0
|
2785 |
},
|
2786 |
{
|
2787 |
"bcp_47": "en",
|
|
|
2795 |
"in_benchmark": true,
|
2796 |
"task": "translation",
|
2797 |
"metric": "chrf",
|
2798 |
+
"score": 0.4880916692700535,
|
2799 |
+
"model": 12.0
|
2800 |
},
|
2801 |
{
|
2802 |
"bcp_47": "eo",
|
|
|
10895 |
"in_benchmark": true,
|
10896 |
"task": "classification",
|
10897 |
"metric": "accuracy",
|
10898 |
+
"score": 0.5499999999999999,
|
10899 |
+
"model": 12.0
|
10900 |
},
|
10901 |
{
|
10902 |
"bcp_47": "zh",
|
|
|
10910 |
"in_benchmark": true,
|
10911 |
"task": "language_modeling",
|
10912 |
"metric": "chrf",
|
10913 |
+
"score": 0.8599948525016986,
|
10914 |
+
"model": 12.0
|
10915 |
},
|
10916 |
{
|
10917 |
"bcp_47": "zh",
|
|
|
10925 |
"in_benchmark": true,
|
10926 |
"task": "translation",
|
10927 |
"metric": "bleu",
|
10928 |
+
"score": 0.3532292543512247,
|
10929 |
+
"model": 12.0
|
10930 |
},
|
10931 |
{
|
10932 |
"bcp_47": "zh",
|
|
|
10940 |
"in_benchmark": true,
|
10941 |
"task": "translation",
|
10942 |
"metric": "chrf",
|
10943 |
+
"score": 0.529398790799104,
|
10944 |
+
"model": 12.0
|
10945 |
},
|
10946 |
{
|
10947 |
"bcp_47": "zmi",
|
|
|
10990 |
}
|
10991 |
],
|
10992 |
"scores": [
|
10993 |
+
{
|
10994 |
+
"model": "amazon/nova-micro-v1",
|
10995 |
+
"bcp_47": "en",
|
10996 |
+
"task": "classification",
|
10997 |
+
"metric": "accuracy",
|
10998 |
+
"score": 0.5333333333333333,
|
10999 |
+
"sentence_nr": 14.5
|
11000 |
+
},
|
11001 |
+
{
|
11002 |
+
"model": "amazon/nova-micro-v1",
|
11003 |
+
"bcp_47": "en",
|
11004 |
+
"task": "language_modeling",
|
11005 |
+
"metric": "chrf",
|
11006 |
+
"score": 0.9725001956658679,
|
11007 |
+
"sentence_nr": 14.5
|
11008 |
+
},
|
11009 |
+
{
|
11010 |
+
"model": "amazon/nova-micro-v1",
|
11011 |
+
"bcp_47": "en",
|
11012 |
+
"task": "translation",
|
11013 |
+
"metric": "bleu",
|
11014 |
+
"score": 0.4491277841667736,
|
11015 |
+
"sentence_nr": 14.5
|
11016 |
+
},
|
11017 |
+
{
|
11018 |
+
"model": "amazon/nova-micro-v1",
|
11019 |
+
"bcp_47": "en",
|
11020 |
+
"task": "translation",
|
11021 |
+
"metric": "chrf",
|
11022 |
+
"score": 0.5740458676508566,
|
11023 |
+
"sentence_nr": 14.5
|
11024 |
+
},
|
11025 |
+
{
|
11026 |
+
"model": "amazon/nova-micro-v1",
|
11027 |
+
"bcp_47": "zh",
|
11028 |
+
"task": "classification",
|
11029 |
+
"metric": "accuracy",
|
11030 |
+
"score": 0.5,
|
11031 |
+
"sentence_nr": 14.5
|
11032 |
+
},
|
11033 |
+
{
|
11034 |
+
"model": "amazon/nova-micro-v1",
|
11035 |
+
"bcp_47": "zh",
|
11036 |
+
"task": "language_modeling",
|
11037 |
+
"metric": "chrf",
|
11038 |
+
"score": 0.9167395508743035,
|
11039 |
+
"sentence_nr": 14.5
|
11040 |
+
},
|
11041 |
+
{
|
11042 |
+
"model": "amazon/nova-micro-v1",
|
11043 |
+
"bcp_47": "zh",
|
11044 |
+
"task": "translation",
|
11045 |
+
"metric": "bleu",
|
11046 |
+
"score": 0.3517140864634192,
|
11047 |
+
"sentence_nr": 14.5
|
11048 |
+
},
|
11049 |
+
{
|
11050 |
+
"model": "amazon/nova-micro-v1",
|
11051 |
+
"bcp_47": "zh",
|
11052 |
+
"task": "translation",
|
11053 |
+
"metric": "chrf",
|
11054 |
+
"score": 0.5543825716892707,
|
11055 |
+
"sentence_nr": 14.5
|
11056 |
+
},
|
11057 |
{
|
11058 |
"model": "google/gemini-2.0-flash-001",
|
11059 |
"bcp_47": "en",
|
|
|
11118 |
"score": 0.5606266861920302,
|
11119 |
"sentence_nr": 14.5
|
11120 |
},
|
11121 |
+
{
|
11122 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11123 |
+
"bcp_47": "en",
|
11124 |
+
"task": "classification",
|
11125 |
+
"metric": "accuracy",
|
11126 |
+
"score": 0.7333333333333333,
|
11127 |
+
"sentence_nr": 14.5
|
11128 |
+
},
|
11129 |
+
{
|
11130 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11131 |
+
"bcp_47": "en",
|
11132 |
+
"task": "language_modeling",
|
11133 |
+
"metric": "chrf",
|
11134 |
+
"score": 0.990925430282282,
|
11135 |
+
"sentence_nr": 14.5
|
11136 |
+
},
|
11137 |
+
{
|
11138 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11139 |
+
"bcp_47": "en",
|
11140 |
+
"task": "translation",
|
11141 |
+
"metric": "bleu",
|
11142 |
+
"score": 0.37911136698810943,
|
11143 |
+
"sentence_nr": 14.5
|
11144 |
+
},
|
11145 |
+
{
|
11146 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11147 |
+
"bcp_47": "en",
|
11148 |
+
"task": "translation",
|
11149 |
+
"metric": "chrf",
|
11150 |
+
"score": 0.5094402087357145,
|
11151 |
+
"sentence_nr": 14.5
|
11152 |
+
},
|
11153 |
+
{
|
11154 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11155 |
+
"bcp_47": "zh",
|
11156 |
+
"task": "classification",
|
11157 |
+
"metric": "accuracy",
|
11158 |
+
"score": 0.7333333333333333,
|
11159 |
+
"sentence_nr": 14.5
|
11160 |
+
},
|
11161 |
+
{
|
11162 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11163 |
+
"bcp_47": "zh",
|
11164 |
+
"task": "language_modeling",
|
11165 |
+
"metric": "chrf",
|
11166 |
+
"score": 0.9511134398957932,
|
11167 |
+
"sentence_nr": 14.5
|
11168 |
+
},
|
11169 |
+
{
|
11170 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11171 |
+
"bcp_47": "zh",
|
11172 |
+
"task": "translation",
|
11173 |
+
"metric": "bleu",
|
11174 |
+
"score": 0.4225918163141283,
|
11175 |
+
"sentence_nr": 14.5
|
11176 |
+
},
|
11177 |
+
{
|
11178 |
+
"model": "google/gemini-2.0-flash-lite-001",
|
11179 |
+
"bcp_47": "zh",
|
11180 |
+
"task": "translation",
|
11181 |
+
"metric": "chrf",
|
11182 |
+
"score": 0.5751241490536672,
|
11183 |
+
"sentence_nr": 14.5
|
11184 |
+
},
|
11185 |
{
|
11186 |
"model": "google/gemma-3-27b-it",
|
11187 |
"bcp_47": "en",
|
|
|
11246 |
"score": 0.520771580386218,
|
11247 |
"sentence_nr": 14.5
|
11248 |
},
|
11249 |
+
{
|
11250 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11251 |
+
"bcp_47": "en",
|
11252 |
+
"task": "classification",
|
11253 |
+
"metric": "accuracy",
|
11254 |
+
"score": 0.8333333333333334,
|
11255 |
+
"sentence_nr": 14.5
|
11256 |
+
},
|
11257 |
+
{
|
11258 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11259 |
+
"bcp_47": "en",
|
11260 |
+
"task": "language_modeling",
|
11261 |
+
"metric": "chrf",
|
11262 |
+
"score": 0.9674315682816375,
|
11263 |
+
"sentence_nr": 14.5
|
11264 |
+
},
|
11265 |
+
{
|
11266 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11267 |
+
"bcp_47": "en",
|
11268 |
+
"task": "translation",
|
11269 |
+
"metric": "bleu",
|
11270 |
+
"score": 0.18722412351358647,
|
11271 |
+
"sentence_nr": 14.5
|
11272 |
+
},
|
11273 |
+
{
|
11274 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11275 |
+
"bcp_47": "en",
|
11276 |
+
"task": "translation",
|
11277 |
+
"metric": "chrf",
|
11278 |
+
"score": 0.34151371128305424,
|
11279 |
+
"sentence_nr": 14.5
|
11280 |
+
},
|
11281 |
+
{
|
11282 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11283 |
+
"bcp_47": "zh",
|
11284 |
+
"task": "classification",
|
11285 |
+
"metric": "accuracy",
|
11286 |
+
"score": 0.7666666666666667,
|
11287 |
+
"sentence_nr": 14.5
|
11288 |
+
},
|
11289 |
+
{
|
11290 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11291 |
+
"bcp_47": "zh",
|
11292 |
+
"task": "language_modeling",
|
11293 |
+
"metric": "chrf",
|
11294 |
+
"score": 0.9230555490695652,
|
11295 |
+
"sentence_nr": 14.5
|
11296 |
+
},
|
11297 |
+
{
|
11298 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11299 |
+
"bcp_47": "zh",
|
11300 |
+
"task": "translation",
|
11301 |
+
"metric": "bleu",
|
11302 |
+
"score": 0.3157439141709964,
|
11303 |
+
"sentence_nr": 14.5
|
11304 |
+
},
|
11305 |
+
{
|
11306 |
+
"model": "meta-llama/llama-3-70b-instruct",
|
11307 |
+
"bcp_47": "zh",
|
11308 |
+
"task": "translation",
|
11309 |
+
"metric": "chrf",
|
11310 |
+
"score": 0.5156364087365835,
|
11311 |
+
"sentence_nr": 14.5
|
11312 |
+
},
|
11313 |
+
{
|
11314 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11315 |
+
"bcp_47": "en",
|
11316 |
+
"task": "classification",
|
11317 |
+
"metric": "accuracy",
|
11318 |
+
"score": 0.7,
|
11319 |
+
"sentence_nr": 14.5
|
11320 |
+
},
|
11321 |
+
{
|
11322 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11323 |
+
"bcp_47": "en",
|
11324 |
+
"task": "language_modeling",
|
11325 |
+
"metric": "chrf",
|
11326 |
+
"score": 0.9701295103188484,
|
11327 |
+
"sentence_nr": 14.5
|
11328 |
+
},
|
11329 |
+
{
|
11330 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11331 |
+
"bcp_47": "en",
|
11332 |
+
"task": "translation",
|
11333 |
+
"metric": "bleu",
|
11334 |
+
"score": 0.44443705644214526,
|
11335 |
+
"sentence_nr": 14.5
|
11336 |
+
},
|
11337 |
+
{
|
11338 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11339 |
+
"bcp_47": "en",
|
11340 |
+
"task": "translation",
|
11341 |
+
"metric": "chrf",
|
11342 |
+
"score": 0.5485685299214524,
|
11343 |
+
"sentence_nr": 14.5
|
11344 |
+
},
|
11345 |
+
{
|
11346 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11347 |
+
"bcp_47": "zh",
|
11348 |
+
"task": "classification",
|
11349 |
+
"metric": "accuracy",
|
11350 |
+
"score": 0.43333333333333335,
|
11351 |
+
"sentence_nr": 14.5
|
11352 |
+
},
|
11353 |
+
{
|
11354 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11355 |
+
"bcp_47": "zh",
|
11356 |
+
"task": "language_modeling",
|
11357 |
+
"metric": "chrf",
|
11358 |
+
"score": 0.8705635265954298,
|
11359 |
+
"sentence_nr": 14.5
|
11360 |
+
},
|
11361 |
+
{
|
11362 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11363 |
+
"bcp_47": "zh",
|
11364 |
+
"task": "translation",
|
11365 |
+
"metric": "bleu",
|
11366 |
+
"score": 0.4192089568216648,
|
11367 |
+
"sentence_nr": 14.5
|
11368 |
+
},
|
11369 |
+
{
|
11370 |
+
"model": "meta-llama/llama-3.1-70b-instruct",
|
11371 |
+
"bcp_47": "zh",
|
11372 |
+
"task": "translation",
|
11373 |
+
"metric": "chrf",
|
11374 |
+
"score": 0.5873498820054043,
|
11375 |
+
"sentence_nr": 14.5
|
11376 |
+
},
|
11377 |
{
|
11378 |
"model": "meta-llama/llama-3.3-70b-instruct",
|
11379 |
"bcp_47": "ar",
|
|
|
11694 |
"score": 0.5862284100611604,
|
11695 |
"sentence_nr": 14.5
|
11696 |
},
|
11697 |
+
{
|
11698 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11699 |
+
"bcp_47": "en",
|
11700 |
+
"task": "classification",
|
11701 |
+
"metric": "accuracy",
|
11702 |
+
"score": 0.43333333333333335,
|
11703 |
+
"sentence_nr": 14.5
|
11704 |
+
},
|
11705 |
+
{
|
11706 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11707 |
+
"bcp_47": "en",
|
11708 |
+
"task": "language_modeling",
|
11709 |
+
"metric": "chrf",
|
11710 |
+
"score": 0.9268050965065061,
|
11711 |
+
"sentence_nr": 14.5
|
11712 |
+
},
|
11713 |
+
{
|
11714 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11715 |
+
"bcp_47": "en",
|
11716 |
+
"task": "translation",
|
11717 |
+
"metric": "bleu",
|
11718 |
+
"score": 0.34049537977839345,
|
11719 |
+
"sentence_nr": 14.5
|
11720 |
+
},
|
11721 |
+
{
|
11722 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11723 |
+
"bcp_47": "en",
|
11724 |
+
"task": "translation",
|
11725 |
+
"metric": "chrf",
|
11726 |
+
"score": 0.4566714452688056,
|
11727 |
+
"sentence_nr": 14.5
|
11728 |
+
},
|
11729 |
+
{
|
11730 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11731 |
+
"bcp_47": "zh",
|
11732 |
+
"task": "classification",
|
11733 |
+
"metric": "accuracy",
|
11734 |
+
"score": 0.4,
|
11735 |
+
"sentence_nr": 14.5
|
11736 |
+
},
|
11737 |
+
{
|
11738 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11739 |
+
"bcp_47": "zh",
|
11740 |
+
"task": "language_modeling",
|
11741 |
+
"metric": "chrf",
|
11742 |
+
"score": 0.8131949865285024,
|
11743 |
+
"sentence_nr": 14.5
|
11744 |
+
},
|
11745 |
+
{
|
11746 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11747 |
+
"bcp_47": "zh",
|
11748 |
+
"task": "translation",
|
11749 |
+
"metric": "bleu",
|
11750 |
+
"score": 0.2941657600332359,
|
11751 |
+
"sentence_nr": 14.5
|
11752 |
+
},
|
11753 |
+
{
|
11754 |
+
"model": "microsoft/phi-4-multimodal-instruct",
|
11755 |
+
"bcp_47": "zh",
|
11756 |
+
"task": "translation",
|
11757 |
+
"metric": "chrf",
|
11758 |
+
"score": 0.4559600841124037,
|
11759 |
+
"sentence_nr": 14.5
|
11760 |
+
},
|
11761 |
+
{
|
11762 |
+
"model": "mistralai/mistral-nemo",
|
11763 |
+
"bcp_47": "en",
|
11764 |
+
"task": "classification",
|
11765 |
+
"metric": "accuracy",
|
11766 |
+
"score": 0.4666666666666667,
|
11767 |
+
"sentence_nr": 14.5
|
11768 |
+
},
|
11769 |
+
{
|
11770 |
+
"model": "mistralai/mistral-nemo",
|
11771 |
+
"bcp_47": "en",
|
11772 |
+
"task": "language_modeling",
|
11773 |
+
"metric": "chrf",
|
11774 |
+
"score": 0.9383955895073849,
|
11775 |
+
"sentence_nr": 14.5
|
11776 |
+
},
|
11777 |
+
{
|
11778 |
+
"model": "mistralai/mistral-nemo",
|
11779 |
+
"bcp_47": "en",
|
11780 |
+
"task": "translation",
|
11781 |
+
"metric": "bleu",
|
11782 |
+
"score": 0.3057719571177098,
|
11783 |
+
"sentence_nr": 14.5
|
11784 |
+
},
|
11785 |
+
{
|
11786 |
+
"model": "mistralai/mistral-nemo",
|
11787 |
+
"bcp_47": "en",
|
11788 |
+
"task": "translation",
|
11789 |
+
"metric": "chrf",
|
11790 |
+
"score": 0.45969934521843914,
|
11791 |
+
"sentence_nr": 14.5
|
11792 |
+
},
|
11793 |
+
{
|
11794 |
+
"model": "mistralai/mistral-nemo",
|
11795 |
+
"bcp_47": "zh",
|
11796 |
+
"task": "classification",
|
11797 |
+
"metric": "accuracy",
|
11798 |
+
"score": 0.5333333333333333,
|
11799 |
+
"sentence_nr": 14.5
|
11800 |
+
},
|
11801 |
+
{
|
11802 |
+
"model": "mistralai/mistral-nemo",
|
11803 |
+
"bcp_47": "zh",
|
11804 |
+
"task": "language_modeling",
|
11805 |
+
"metric": "chrf",
|
11806 |
+
"score": 0.8247133394312195,
|
11807 |
+
"sentence_nr": 14.5
|
11808 |
+
},
|
11809 |
+
{
|
11810 |
+
"model": "mistralai/mistral-nemo",
|
11811 |
+
"bcp_47": "zh",
|
11812 |
+
"task": "translation",
|
11813 |
+
"metric": "bleu",
|
11814 |
+
"score": 0.32971687049116577,
|
11815 |
+
"sentence_nr": 14.5
|
11816 |
+
},
|
11817 |
+
{
|
11818 |
+
"model": "mistralai/mistral-nemo",
|
11819 |
+
"bcp_47": "zh",
|
11820 |
+
"task": "translation",
|
11821 |
+
"metric": "chrf",
|
11822 |
+
"score": 0.5266852291276966,
|
11823 |
+
"sentence_nr": 14.5
|
11824 |
+
},
|
11825 |
{
|
11826 |
"model": "mistralai/mistral-small-24b-instruct-2501",
|
11827 |
"bcp_47": "en",
|
|
|
11949 |
"metric": "chrf",
|
11950 |
"score": 0.559410465345808,
|
11951 |
"sentence_nr": 14.5
|
11952 |
+
},
|
11953 |
+
{
|
11954 |
+
"model": "qwen/qwq-32b",
|
11955 |
+
"bcp_47": "en",
|
11956 |
+
"task": "classification",
|
11957 |
+
"metric": "accuracy",
|
11958 |
+
"score": 0.0,
|
11959 |
+
"sentence_nr": 14.5
|
11960 |
+
},
|
11961 |
+
{
|
11962 |
+
"model": "qwen/qwq-32b",
|
11963 |
+
"bcp_47": "en",
|
11964 |
+
"task": "language_modeling",
|
11965 |
+
"metric": "chrf",
|
11966 |
+
"score": 0.6047457400839834,
|
11967 |
+
"sentence_nr": 14.5
|
11968 |
+
},
|
11969 |
+
{
|
11970 |
+
"model": "qwen/qwq-32b",
|
11971 |
+
"bcp_47": "en",
|
11972 |
+
"task": "translation",
|
11973 |
+
"metric": "bleu",
|
11974 |
+
"score": 0.20068036705764214,
|
11975 |
+
"sentence_nr": 14.5
|
11976 |
+
},
|
11977 |
+
{
|
11978 |
+
"model": "qwen/qwq-32b",
|
11979 |
+
"bcp_47": "en",
|
11980 |
+
"task": "translation",
|
11981 |
+
"metric": "chrf",
|
11982 |
+
"score": 0.23884729813422853,
|
11983 |
+
"sentence_nr": 14.5
|
11984 |
+
},
|
11985 |
+
{
|
11986 |
+
"model": "qwen/qwq-32b",
|
11987 |
+
"bcp_47": "zh",
|
11988 |
+
"task": "classification",
|
11989 |
+
"metric": "accuracy",
|
11990 |
+
"score": 0.0,
|
11991 |
+
"sentence_nr": 14.5
|
11992 |
+
},
|
11993 |
+
{
|
11994 |
+
"model": "qwen/qwq-32b",
|
11995 |
+
"bcp_47": "zh",
|
11996 |
+
"task": "language_modeling",
|
11997 |
+
"metric": "chrf",
|
11998 |
+
"score": 0.35788429123492,
|
11999 |
+
"sentence_nr": 14.5
|
12000 |
+
},
|
12001 |
+
{
|
12002 |
+
"model": "qwen/qwq-32b",
|
12003 |
+
"bcp_47": "zh",
|
12004 |
+
"task": "translation",
|
12005 |
+
"metric": "bleu",
|
12006 |
+
"score": 0.22828858009816946,
|
12007 |
+
"sentence_nr": 14.5
|
12008 |
+
},
|
12009 |
+
{
|
12010 |
+
"model": "qwen/qwq-32b",
|
12011 |
+
"bcp_47": "zh",
|
12012 |
+
"task": "translation",
|
12013 |
+
"metric": "chrf",
|
12014 |
+
"score": 0.3698284418118128,
|
12015 |
+
"sentence_nr": 14.5
|
12016 |
}
|
12017 |
]
|
12018 |
}
|