David Pomerenke committed
Commit 731eddd · Parent 60d1364

Translation both from and to

Files changed:
- evals/backend.py +1 -1
- evals/models.py +1 -2
- evals/tasks.py +26 -16
- frontend/src/components/LanguageTable.js +15 -3
- frontend/src/components/ModelTable.js +16 -4
- results.json +0 -0
evals/backend.py CHANGED
@@ -22,7 +22,7 @@ def mean(lst):
     return sum(lst) / len(lst) if lst else None
 
 
-task_metrics = ["translation_bleu", "classification_accuracy"]
+task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
 
 
 def make_model_table(df, models):
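The new keys follow a {task}_{metric} naming scheme, matching the "task" and "metric" fields emitted by the task coroutines in evals/tasks.py. A minimal sketch of how such result rows could be pivoted into these columns; the rows and the pivot are illustrative assumptions, not the actual aggregation code in backend.py:

import pandas as pd

# Hypothetical result rows in the shape emitted by the task coroutines:
# one dict per (model, language, task, metric) score.
rows = pd.DataFrame([
    {"model": "m", "bcp_47": "en", "task": "translation_from", "metric": "bleu", "score": 0.41},
    {"model": "m", "bcp_47": "en", "task": "translation_to", "metric": "bleu", "score": 0.38},
    {"model": "m", "bcp_47": "en", "task": "classification", "metric": "accuracy", "score": 1.0},
])

# Joining task and metric reproduces the task_metrics keys,
# e.g. "translation_from_bleu"; scores are then averaged per model.
rows["task_metric"] = rows["task"] + "_" + rows["metric"]
table = rows.pivot_table(index="model", columns="task_metric", values="score", aggfunc="mean")
print(table.columns.tolist())
# ['classification_accuracy', 'translation_from_bleu', 'translation_to_bleu']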
evals/models.py CHANGED
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
     raise ValueError(f"Model {model} not supported")
 
 
-models = pd.DataFrame(models, columns=["id"])
+models = pd.DataFrame(models, columns=["id"]).iloc[:3]
 
 
 @cache
@@ -144,7 +144,6 @@ def get_hf_metadata(row):
     if not row:
         return empty
     id = row["hf_slug"] or row["slug"].split(":")[0]
-    print(id)
     if not id:
         return empty
     try:
evals/tasks.py CHANGED
@@ -1,30 +1,37 @@
 import random
+from functools import partial
 
 import evaluate
 import pandas as pd
+import sentencepiece as spm
+from datasets_.flores import flores_sentences
 from joblib.memory import Memory
 from languages import languages, script_name
-from datasets_.flores import flores_sentences
 from models import complete, transcribe
-import sentencepiece as spm
 
 cache = Memory(location=".cache", verbose=0).cache
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
 wer = evaluate.load("wer")
-tokenizer = spm.SentencePieceProcessor(model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model")
+tokenizer = spm.SentencePieceProcessor(
+    model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model"
+)
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     frac=1, weights="speakers", replace=True, random_state=42
 )
 
+
 @cache
-async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
-    original_language = languages[languages["bcp_47"] == original_language_bcp_47].iloc[
-        0
-    ]
+async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
+    original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     target_language = target_languages.iloc[sentence_nr]
+    match mode:
+        case "from":
+            pass
+        case "to":
+            original_language, target_language = target_language, original_language
     original_sentence = flores_sentences(original_language)[sentence_nr].strip()
     target_sentence = flores_sentences(target_language)[sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
@@ -52,14 +59,15 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     return [
         {
             "model": model,
-            "bcp_47": original_language_bcp_47,
-            "task": "translation",
+            "bcp_47": bcp_47,
+            "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
            "sentence_nr": sentence_nr,
         }
-        for metric, score in …
-            …
+        for metric, score in (
+            ("bleu", bleu_score["bleu"]),
+            ("chrf", chrf_score["score"] / 100),
         )
     ]
 
@@ -68,8 +76,8 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 
 
 @cache
-async def classify_and_evaluate(model, language_bcp_47, nr):
-    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+async def classify_and_evaluate(model, bcp_47, nr):
+    language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
     sentences = pd.concat([metadata, sentences], axis=1)
     sentences = sentences.dropna(subset=["topic"])
@@ -119,7 +127,7 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
     return [
         {
             "model": model,
-            "bcp_47": language_bcp_47,
+            "bcp_47": bcp_47,
             "task": "classification",
             "metric": "accuracy",
             "score": int(pred == true),
@@ -177,6 +185,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
     }
 ]
 
+
 @cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
@@ -210,8 +219,9 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
 
 
 tasks = [
-    translate_and_evaluate,
+    partial(translate_and_evaluate, mode="from"),
+    partial(translate_and_evaluate, mode="to"),
     classify_and_evaluate,
     # mlm_and_evaluate,
     # transcribe_and_evaluate,
 ]
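Since functools.partial only pre-binds the mode keyword, both translation variants keep the same positional call shape as the other task coroutines in the list. A minimal sketch of the resulting call pattern; the stub coroutines, the model name, and the runner loop are assumptions for illustration, not code from this repo:

import asyncio
from functools import partial

# Stub coroutines with the same signatures as in evals/tasks.py,
# so the example is self-contained.
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
    return {"model": model, "bcp_47": bcp_47, "task": f"translation_{mode}"}

async def classify_and_evaluate(model, bcp_47, nr):
    return {"model": model, "bcp_47": bcp_47, "task": "classification"}

tasks = [
    partial(translate_and_evaluate, mode="from"),  # given language -> sampled targets
    partial(translate_and_evaluate, mode="to"),    # sampled sources -> given language
    classify_and_evaluate,
]

async def main():
    # Each entry, plain coroutine or partial, is awaited
    # with the same positional arguments.
    results = [await task("some-model", "en", 0) for task in tasks]
    for r in results:
        print(r)

asyncio.run(main())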
frontend/src/components/LanguageTable.js CHANGED
@@ -174,10 +174,22 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
                 style={{ minWidth: '5rem', maxWidth: '10rem' }}
             />
             <Column
-                field='translation_bleu'
-                header='Translation'
+                field='translation_from_bleu'
+                header="Translation (from)"
+                headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
                 sortable
-                body={scoreBodyTemplate('translation_bleu', {
+                body={scoreBodyTemplate('translation_from_bleu', {
+                    minScore: 0,
+                    maxScore: 0.5
+                })}
+                style={{ minWidth: '5rem', maxWidth: '10rem' }}
+            />
+            <Column
+                field='translation_to_bleu'
+                header="Translation (to)"
+                headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
+                sortable
+                body={scoreBodyTemplate('translation_to_bleu', {
                     minScore: 0,
                     maxScore: 0.5
                 })}
frontend/src/components/ModelTable.js CHANGED
@@ -224,12 +224,24 @@ const ModelTable = ({ data }) => {
                 style={{ minWidth: '5rem', maxWidth: '10rem' }}
             />
             <Column
-                field='translation_bleu'
-                header='Translation'
+                field='translation_from_bleu'
+                header="Translation (from)"
+                headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
                 sortable
-                body={scoreBodyTemplate('translation_bleu', {
+                body={scoreBodyTemplate('translation_from_bleu', {
                     minScore: 0,
-                    maxScore: 0.…
+                    maxScore: 0.5
+                })}
+                style={{ minWidth: '5rem', maxWidth: '10rem' }}
+            />
+            <Column
+                field='translation_to_bleu'
+                header="Translation (to)"
+                headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
+                sortable
+                body={scoreBodyTemplate('translation_to_bleu', {
+                    minScore: 0,
+                    maxScore: 0.5
                 })}
                 style={{ minWidth: '5rem', maxWidth: '10rem' }}
             />
results.json CHANGED
The diff for this file is too large to render. See raw diff.