David Pomerenke committed
Commit 7fc657e · 1 Parent(s): 0c05388
Classification evaluation

Browse files
- app.py +3 -0
- evals.py +89 -10
- results.json +0 -0
app.py
CHANGED
@@ -187,6 +187,9 @@ def create_language_stats_df(results):
             if best_score["bleu"] is not None
             else "N/A",
             "CommonVoice Hours": commonvoice_link,
+            "Accuracy": round(lang["accuracy"], 3)
+            if lang["accuracy"] is not None
+            else "N/A",
         }
         flat_data.append(row)

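The new "Accuracy" cell mirrors the existing BLEU cell: round the score when it is present, otherwise show "N/A". A minimal, self-contained sketch of that pattern (the lang dicts and flat_data list here are stand-ins, not the app's real result objects):

    # Illustrative only: same round-or-"N/A" pattern as the diff above.
    flat_data = []
    for lang in [{"accuracy": 0.8333}, {"accuracy": None}]:  # stand-in results
        row = {
            "Accuracy": round(lang["accuracy"], 3)
            if lang["accuracy"] is not None
            else "N/A",
        }
        flat_data.append(row)
    print(flat_data)  # [{'Accuracy': 0.833}, {'Accuracy': 'N/A'}]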
evals.py
CHANGED
@@ -15,13 +15,14 @@ from langcodes import Language, standardize_tag
 from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
 from openai import AsyncOpenAI
 from requests import get
+from rich import print
 from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer

 # config
 models = [
     "openai/gpt-4o-mini", # 0.6$/M tokens
-    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
+    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
     "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
     "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
     "google/gemini-2.0-flash-001", # 0.4$/M tokens
@@ -138,14 +139,14 @@ languages = pd.merge(
 ) # "left" because keep it simple for now
 languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])

-languages = languages.sort_values(by="speakers", ascending=False)
+languages = languages.sort_values(by="speakers", ascending=False).iloc[:10]

 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(n=
+detailed_languages = languages[languages["in_benchmark"]].sample(n=1, random_state=42)


 # utils
@@ -213,13 +214,71 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     }


+metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
+
+
+@cache
+async def classify_and_evaluate(model, language_bcp_47, nr):
+    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+    sentences = pd.DataFrame(load_sentences(language), columns=["text"])
+    sentences = pd.concat([metadata, sentences], axis=1)
+    sentences = sentences.dropna(subset=["topic"])
+    sentences["topic"] = sentences["topic"].str.lower()
+    paragraphs = (
+        sentences.groupby("URL").agg({"text": " ".join, "topic": "first"}).reset_index()
+    )
+    top_topics = paragraphs.value_counts("topic").head(5).index
+    paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=5, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=42)
+    test_paragraphs = paragraphs[~paragraphs["URL"].isin(examples["URL"])].sample(
+        frac=1, random_state=42
+    )
+    test_paragraph = test_paragraphs.iloc[nr]
+    messages = [
+        {
+            "role": "system",
+            "content": f"Categories: {'; '.join(examples['topic'].drop_duplicates())}.",
+        }
+    ]
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": example.text},
+            {"role": "assistant", "content": example.topic},
+        ]
+    reply = await complete(
+        model=model,
+        messages=[
+            *messages,
+            {
+                "role": "user",
+                "content": test_paragraph.text,
+            },
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    prediction = reply.choices[0].message.content.strip()
+    return {
+        "model": model,
+        "bcp_47": language["bcp_47"],
+        "true": test_paragraph.topic,
+        "pred": prediction,
+        "sentence_nr": nr,
+    }
+
+
 def mean(lst):
     return sum(lst) / len(lst) if lst else 0


 # evaluation!
 async def main():
-    …
+    print("evaluate translation")
+    translation_scores = [
         translate_and_evaluate(model, original_language.bcp_47, i)
         for i in range(n_sentences)
         for original_language in languages.itertuples()
@@ -230,22 +289,41 @@ async def main():
             or original_language.bcp_47 in detailed_languages.bcp_47.values
         )
     ]
-    …
+    translation_scores = await tqdm_asyncio.gather(*translation_scores, miniters=1)
+    print("evaluate classification")
+    classification_scores = [
+        classify_and_evaluate(model, language.bcp_47, i)
+        for i in range(n_sentences)
+        for language in languages.itertuples()
+        for model in models
+        if language.in_benchmark
+        and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
+    ]
+    classification_scores = await tqdm_asyncio.gather(
+        *classification_scores, miniters=1
+    )
     results = []
     for language in languages.itertuples():
         results_for_language = []
         for model in models:
-            …
+            translations_for_model = [
+                score
+                for score in translation_scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
+            classifications_for_model = [
                 score
-                for score in …
+                for score in classification_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
-            …
+            accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            if translations_for_model:
                 results_for_language.append(
                     {
                         "model": model,
-                        "bleu": mean([s["bleu"] for s in …
-                        "chrf": mean([s["chrf"] for s in …
+                        "bleu": mean([s["bleu"] for s in translations_for_model]),
+                        "chrf": mean([s["chrf"] for s in translations_for_model]),
+                        "accuracy": accuracy,
                     }
                 )
         if results_for_language:
@@ -257,6 +335,7 @@ async def main():
                     "scores": results_for_language,
                     "bleu": mean([s["bleu"] for s in results_for_language]),
                     "chrf": mean([s["chrf"] for s in results_for_language]),
+                    "accuracy": mean([s["accuracy"] for s in results_for_language]),
                     "commonvoice_hours": language.commonvoice_hours
                     if not pd.isna(language.commonvoice_hours)
                     else None,
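The classification task added in classify_and_evaluate works as few-shot prompting: labelled example paragraphs are sent as alternating user/assistant turns, the held-out paragraph is sent last, and the model's reply is scored by exact match against the true topic. A simplified, self-contained sketch of that flow with dummy data (build_messages, fake_complete and the example texts are stand-ins, not part of evals.py):

    # Illustrative sketch: few-shot topic classification scored by exact-match accuracy.
    def build_messages(examples, test_text):
        topics = []
        for _, topic in examples:
            if topic not in topics:
                topics.append(topic)
        # the system turn lists the allowed categories, like the diff's system message
        messages = [{"role": "system", "content": f"Categories: {'; '.join(topics)}."}]
        for text, topic in examples:
            messages += [
                {"role": "user", "content": text},
                {"role": "assistant", "content": topic},
            ]
        messages.append({"role": "user", "content": test_text})
        return messages

    def mean(lst):
        return sum(lst) / len(lst) if lst else 0

    def fake_complete(messages):
        # stand-in for the real chat-completion call
        return "economy"

    examples = [("rates rose again", "economy"), ("the match ended 2-1", "sports")]
    tests = [("markets fell sharply", "economy"), ("the striker scored twice", "sports")]
    preds = [(fake_complete(build_messages(examples, text)), true) for text, true in tests]
    print(mean([pred == true for pred, true in preds]))  # 0.5 with this stand-in model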
results.json
CHANGED
The diff for this file is too large to render.
See raw diff