David Pomerenke committed
Commit 731eddd · 1 Parent(s): 60d1364

Translation both from and to
evals/backend.py CHANGED
@@ -22,7 +22,7 @@ def mean(lst):
     return sum(lst) / len(lst) if lst else None
 
 
-task_metrics = ["translation_bleu", "classification_accuracy"]
+task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
 
 
 def make_model_table(df, models):
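
The two new keys follow the {task}_{metric} naming that evals/tasks.py now emits: the task is translation_from or translation_to, the metric is bleu. Below is a rough sketch of how flat result rows could be pivoted into the per-model columns that the frontend tables read; pivot_scores is a hypothetical helper, and only task_metrics and the naming scheme come from this commit.

import pandas as pd

task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]

def pivot_scores(df):
    # Hypothetical helper: rows carry "task", "metric" and "score" (as emitted
    # by translate_and_evaluate / classify_and_evaluate); combine them into one
    # "{task}_{metric}" column per entry of task_metrics, averaged per model.
    df = df.assign(task_metric=df["task"] + "_" + df["metric"])
    wide = df.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")
    return wide.reindex(columns=task_metrics)

rows = pd.DataFrame([
    {"model": "m", "task": "translation_from", "metric": "bleu", "score": 0.31},
    {"model": "m", "task": "translation_to", "metric": "bleu", "score": 0.27},
])
print(pivot_scores(rows))  # one row per model, one column per task metric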
evals/models.py CHANGED
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
     raise ValueError(f"Model {model} not supported")
 
 
-models = pd.DataFrame(models, columns=["id"])
+models = pd.DataFrame(models, columns=["id"]).iloc[:3]
 
 
 @cache
@@ -144,7 +144,6 @@ def get_hf_metadata(row):
     if not row:
         return empty
     id = row["hf_slug"] or row["slug"].split(":")[0]
-    print(id)
     if not id:
         return empty
     try:
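
Note that the added .iloc[:3] caps the models frame at its first three rows, so only three models get evaluated, presumably a temporary restriction for quick test runs. A minimal illustration with made-up model ids:

import pandas as pd

models = ["model-a", "model-b", "model-c", "model-d"]  # made-up ids
models = pd.DataFrame(models, columns=["id"]).iloc[:3]
print(models["id"].tolist())  # ['model-a', 'model-b', 'model-c']; "model-d" is never evaluated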
evals/tasks.py CHANGED
@@ -1,30 +1,37 @@
 import random
+from functools import partial
 
 import evaluate
 import pandas as pd
+import sentencepiece as spm
+from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
-from datasets_.flores import flores_sentences
 from models import complete, transcribe
-import sentencepiece as spm
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
 wer = evaluate.load("wer")
-tokenizer = spm.SentencePieceProcessor(model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model")
+tokenizer = spm.SentencePieceProcessor(
+    model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model"
+)
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )
 
+
 @cache
-async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
-    original_language = languages[languages["bcp_47"] == original_language_bcp_47].iloc[
-        0
-    ]
+async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
+    original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     target_language = target_languages.iloc[sentence_nr]
+    match mode:
+        case "from":
+            pass
+        case "to":
+            original_language, target_language = target_language, original_language
     original_sentence = flores_sentences(original_language)[sentence_nr].strip()
     target_sentence = flores_sentences(target_language)[sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
@@ -52,14 +59,15 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     return [
         {
             "model": model,
-            "bcp_47": original_language["bcp_47"],
-            "task": "translation",
+            "bcp_47": bcp_47,
+            "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
             "sentence_nr": sentence_nr,
         }
-        for metric, score in zip(
-            ["bleu", "chrf"], [bleu_score["bleu"], chrf_score["score"] / 100]
+        for metric, score in (
+            ("bleu", bleu_score["bleu"]),
+            ("chrf", chrf_score["score"] / 100),
         )
     ]
 
@@ -68,8 +76,8 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 
 
 @cache
-async def classify_and_evaluate(model, language_bcp_47, nr):
-    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+async def classify_and_evaluate(model, bcp_47, nr):
+    language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
     sentences = pd.concat([metadata, sentences], axis=1)
     sentences = sentences.dropna(subset=["topic"])
@@ -119,7 +127,7 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
     return [
         {
             "model": model,
-            "bcp_47": language["bcp_47"],
+            "bcp_47": bcp_47,
             "task": "classification",
             "metric": "accuracy",
             "score": int(pred == true),
@@ -177,6 +185,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         }
     ]
 
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
@@ -210,8 +219,9 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
 
 
 tasks = [
-    translate_and_evaluate,
+    partial(translate_and_evaluate, mode="from"),
+    partial(translate_and_evaluate, mode="to"),
     classify_and_evaluate,
     # mlm_and_evaluate,
     # transcribe_and_evaluate,
-]
+]
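
The new mode parameter drives both translation directions through one function: mode="to" swaps source and target language before the sentences are looked up, and functools.partial pre-binds the keyword so that each entry in tasks keeps the usual (model, bcp_47, sentence_nr) call signature. The following self-contained sketch mirrors only the mode handling and the partial wiring from this diff; the fixed counterpart language and the runner loop are illustrative stand-ins.

import asyncio
from functools import partial

async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
    # Simplified stand-in for the real task: the real code samples the
    # counterpart language from FLORES; here it is fixed for illustration.
    original_language, target_language = bcp_47, "de"
    match mode:
        case "from":
            pass  # translate out of the evaluated language
        case "to":
            # swap direction: translate into the evaluated language instead
            original_language, target_language = target_language, original_language
    return {
        "model": model,
        "bcp_47": bcp_47,  # the evaluated language, in both modes
        "task": f"translation_{mode}",
        "direction": (original_language, target_language),
        "sentence_nr": sentence_nr,
    }

# partial() pre-binds the keyword, so both entries keep the original
# (model, bcp_47, sentence_nr) call signature expected by the task runner.
tasks = [
    partial(translate_and_evaluate, mode="from"),
    partial(translate_and_evaluate, mode="to"),
]

async def main():
    for task in tasks:
        print(await task("some-model", "en", 0))

asyncio.run(main())

The two result rows differ only in their task field (translation_from vs. translation_to), which is exactly what the new task_metrics entries in evals/backend.py and the two frontend columns key on.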
frontend/src/components/LanguageTable.js CHANGED
@@ -174,10 +174,22 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
         style={{ minWidth: '5rem', maxWidth: '10rem' }}
       />
       <Column
-        field='translation_bleu'
-        header='Translation'
+        field='translation_from_bleu'
+        header="Translation (from)"
+        headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
         sortable
-        body={scoreBodyTemplate('translation_bleu', {
+        body={scoreBodyTemplate('translation_from_bleu', {
+          minScore: 0,
+          maxScore: 0.5
+        })}
+        style={{ minWidth: '5rem', maxWidth: '10rem' }}
+      />
+      <Column
+        field='translation_to_bleu'
+        header="Translation (to)"
+        headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
+        sortable
+        body={scoreBodyTemplate('translation_to_bleu', {
           minScore: 0,
           maxScore: 0.5
         })}
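
Both column tooltips label the metric as spBLEU, i.e. BLEU computed over pieces from the FLORES-200 SentencePiece tokenizer that evals/tasks.py now loads. The scoring call itself is not visible in the hunks above, so the snippet below is only a sketch of how such a tokenizer is commonly plugged into the evaluate BLEU metric via its tokenizer argument; the sentences are made up.

import evaluate
import sentencepiece as spm

bleu = evaluate.load("bleu")
tokenizer = spm.SentencePieceProcessor(
    model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model"
)

prediction = "Das ist ein Beispielsatz."  # made-up model output
reference = "Dies ist ein Beispielsatz."  # made-up FLORES reference

# spBLEU: split both sides into SentencePiece pieces, then score plain BLEU,
# keeping scores comparable across languages without language-specific rules.
score = bleu.compute(
    predictions=[prediction],
    references=[[reference]],
    tokenizer=lambda text: tokenizer.encode(text, out_type=str),
)
print(score["bleu"])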
frontend/src/components/ModelTable.js CHANGED
@@ -224,12 +224,24 @@ const ModelTable = ({ data }) => {
         style={{ minWidth: '5rem', maxWidth: '10rem' }}
       />
       <Column
-        field='translation_bleu'
-        header='Translation'
+        field='translation_from_bleu'
+        header="Translation (from)"
+        headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
         sortable
-        body={scoreBodyTemplate('translation_bleu', {
+        body={scoreBodyTemplate('translation_from_bleu', {
           minScore: 0,
-          maxScore: 0.3
+          maxScore: 0.5
+        })}
+        style={{ minWidth: '5rem', maxWidth: '10rem' }}
+      />
+      <Column
+        field='translation_to_bleu'
+        header="Translation (to)"
+        headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
+        sortable
+        body={scoreBodyTemplate('translation_to_bleu', {
+          minScore: 0,
+          maxScore: 0.5
         })}
         style={{ minWidth: '5rem', maxWidth: '10rem' }}
       />
results.json CHANGED
The diff for this file is too large to render.