David Pomerenke committed
Commit 7fc657e · Parent: 0c05388

Classification evaluation

Files changed (3)
  1. app.py +3 -0
  2. evals.py +89 -10
  3. results.json +0 -0
app.py CHANGED
@@ -187,6 +187,9 @@ def create_language_stats_df(results):
             if best_score["bleu"] is not None
             else "N/A",
             "CommonVoice Hours": commonvoice_link,
+            "Accuracy": round(lang["accuracy"], 3)
+            if lang["accuracy"] is not None
+            else "N/A",
         }
         flat_data.append(row)
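The new column reuses the same None-guard pattern as the BLEU column above it: round the metric for display, fall back to "N/A" when no score exists. A minimal sketch of that pattern (the `format_metric` helper and sample values are illustrative, not part of app.py):

```python
def format_metric(value, digits=3):
    # Round a metric for display; missing scores render as "N/A".
    return round(value, digits) if value is not None else "N/A"

print(format_metric(0.7214))  # 0.721
print(format_metric(None))    # N/A
```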
 
evals.py CHANGED
@@ -15,13 +15,14 @@ from langcodes import Language, standardize_tag
 from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
 from openai import AsyncOpenAI
 from requests import get
+from rich import print
 from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer
 
 # config
 models = [
     "openai/gpt-4o-mini", # 0.6$/M tokens
-    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
+    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
     "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
     "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
     "google/gemini-2.0-flash-001", # 0.4$/M tokens
@@ -138,14 +139,14 @@ languages = pd.merge(
 ) # "left" because keep it simple for now
 languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
 
-languages = languages.sort_values(by="speakers", ascending=False)
+languages = languages.sort_values(by="speakers", ascending=False).iloc[:10]
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(n=30, random_state=42)
+detailed_languages = languages[languages["in_benchmark"]].sample(n=1, random_state=42)
 
 
 # utils
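As a side note, `DataFrame.sample(weights="speakers", replace=True)` draws languages proportionally to speaker counts, which is how `target_languages` over-represents widely spoken languages. A small self-contained illustration with invented speaker numbers:

```python
import pandas as pd

# Toy speaker counts (invented); the real values come from LANGUAGE_SPEAKING_POPULATION.
toy = pd.DataFrame({"bcp_47": ["en", "hi", "yo"], "speakers": [1.5e9, 6e8, 4.5e7]})

# With replacement and speaker weights, "en" dominates the draw and "yo" appears rarely.
sample = toy.sample(n=10, weights="speakers", replace=True, random_state=42)
print(sample["bcp_47"].value_counts())
```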
@@ -213,13 +214,71 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     }
 
 
+metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
+
+@cache
+async def classify_and_evaluate(model, language_bcp_47, nr):
+    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+    sentences = pd.DataFrame(load_sentences(language), columns=["text"])
+    sentences = pd.concat([metadata, sentences], axis=1)
+    sentences = sentences.dropna(subset=["topic"])
+    sentences["topic"] = sentences["topic"].str.lower()
+    paragraphs = (
+        sentences.groupby("URL").agg({"text": " ".join, "topic": "first"}).reset_index()
+    )
+    top_topics = paragraphs.value_counts("topic").head(5).index
+    paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=5, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=42)
+    test_paragraphs = paragraphs[~paragraphs["URL"].isin(examples["URL"])].sample(
+        frac=1, random_state=42
+    )
+    test_paragraph = test_paragraphs.iloc[nr]
+    messages = [
+        {
+            "role": "system",
+            "content": f"Categories: {'; '.join(examples['topic'].drop_duplicates())}.",
+        }
+    ]
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": example.text},
+            {"role": "assistant", "content": example.topic},
+        ]
+    reply = await complete(
+        model=model,
+        messages=[
+            *messages,
+            {
+                "role": "user",
+                "content": test_paragraph.text,
+            },
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    prediction = reply.choices[0].message.content.strip()
+    return {
+        "model": model,
+        "bcp_47": language["bcp_47"],
+        "true": test_paragraph.topic,
+        "pred": prediction,
+        "sentence_nr": nr,
+    }
+
+
 def mean(lst):
     return sum(lst) / len(lst) if lst else 0
 
 
 # evaluation!
 async def main():
-    scores = [
+    print("evaluate translation")
+    translation_scores = [
         translate_and_evaluate(model, original_language.bcp_47, i)
         for i in range(n_sentences)
         for original_language in languages.itertuples()
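To make the prompt format explicit: `classify_and_evaluate` sends a system message listing the candidate topics, then one user/assistant pair per in-context example (5 topics × 5 sampled paragraphs), and finally the held-out paragraph to label. A sketch of the resulting message list, with placeholder texts and placeholder topic names (real texts are dev-set sentences grouped into paragraphs by URL, and real labels come from the metadata "topic" column):

```python
# Placeholder sketch of the few-shot classification prompt.
messages = [
    {"role": "system", "content": "Categories: politics; science; sports; health; travel."},
    {"role": "user", "content": "<example paragraph about an election>"},
    {"role": "assistant", "content": "politics"},
    {"role": "user", "content": "<example paragraph about a football match>"},
    {"role": "assistant", "content": "sports"},
    # ... 25 example pairs in total ...
    {"role": "user", "content": "<held-out test paragraph>"},  # the model's reply is the prediction
]
```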
@@ -230,22 +289,41 @@ async def main():
             or original_language.bcp_47 in detailed_languages.bcp_47.values
         )
     ]
-    scores = await tqdm_asyncio.gather(*scores, miniters=1)
+    translation_scores = await tqdm_asyncio.gather(*translation_scores, miniters=1)
+    print("evaluate classification")
+    classification_scores = [
+        classify_and_evaluate(model, language.bcp_47, i)
+        for i in range(n_sentences)
+        for language in languages.itertuples()
+        for model in models
+        if language.in_benchmark
+        and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
+    ]
+    classification_scores = await tqdm_asyncio.gather(
+        *classification_scores, miniters=1
+    )
     results = []
     for language in languages.itertuples():
         results_for_language = []
         for model in models:
-            results_for_model = [
+            translations_for_model = [
+                score
+                for score in translation_scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
+            classifications_for_model = [
                 score
-                for score in scores
+                for score in classification_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
-            if results_for_model:
+            accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            if translations_for_model:
                 results_for_language.append(
                     {
                         "model": model,
-                        "bleu": mean([s["bleu"] for s in results_for_model]),
-                        "chrf": mean([s["chrf"] for s in results_for_model]),
+                        "bleu": mean([s["bleu"] for s in translations_for_model]),
+                        "chrf": mean([s["chrf"] for s in translations_for_model]),
+                        "accuracy": accuracy,
                     }
                 )
         if results_for_language:
@@ -257,6 +335,7 @@ async def main():
                 "scores": results_for_language,
                 "bleu": mean([s["bleu"] for s in results_for_language]),
                 "chrf": mean([s["chrf"] for s in results_for_language]),
+                "accuracy": mean([s["accuracy"] for s in results_for_language]),
                 "commonvoice_hours": language.commonvoice_hours
                 if not pd.isna(language.commonvoice_hours)
                 else None,
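For reference, the per-model accuracy above is a strict exact-match rate between the gold topic and the model's reply (the reply is only stripped, so a reply like "Politics." would not match "politics"). A minimal sketch with invented score records:

```python
def mean(lst):
    return sum(lst) / len(lst) if lst else 0

# Invented records in the same shape as classification_scores entries.
classifications_for_model = [
    {"true": "politics", "pred": "politics"},
    {"true": "sports", "pred": "politics"},
    {"true": "health", "pred": "health"},
]
accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
print(accuracy)  # 2 of 3 exact matches -> 0.666...
```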
results.json CHANGED
The diff for this file is too large to render. See raw diff
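The results.json diff is too large to render, but judging from the dict assembled per language in main(), each entry should now carry an accuracy field alongside bleu and chrf, roughly of this shape (all values invented; the language-identifier field is assumed):

```python
# Hypothetical shape of one results.json entry after this commit.
{
    "bcp_47": "de",  # assumed identifier field
    "scores": [
        {"model": "openai/gpt-4o-mini", "bleu": 0.41, "chrf": 58.2, "accuracy": 0.80},
    ],
    "bleu": 0.41,
    "chrf": 58.2,
    "accuracy": 0.80,
    "commonvoice_hours": 1342,
}
```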