David Pomerenke committed
Commit 8f4448c · 1 Parent(s): e223525
Display ASR-WER in app

app.py CHANGED

@@ -60,6 +60,17 @@ METRICS = {
         between predicted and actual text. Higher scores indicate better language understanding.
         """,
     },
+    "asr_wer": {
+        "display_name": "Automatic Speech Recognition (WER)",
+        "field_name": "asr_wer",
+        "label": "WER",
+        "explanation": """
+        **Automatic Speech Recognition Word Error Rate**: Measures the accuracy of speech-to-text transcription.
+        It calculates the minimum number of word edits (insertions, deletions, substitutions) needed to transform the
+        transcription into the reference text, divided by the number of words in the reference.
+        Lower scores indicate better performance, with 0 being perfect transcription.
+        """,
+    },
 }
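
The new explanation string describes the standard WER computation: word-level edit distance divided by the length of the reference. A minimal worked sketch of that formula, for illustration only (this is not code from app.py, and the function name is made up here):

```python
# Word error rate sketch: word-level Levenshtein distance / number of reference words.
# The value can exceed 1.0 when the hypothesis is much longer than the reference.
def word_error_rate(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j] = edits needed to turn the first i reference words into the first j hypothesis words
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dp[i][0] = i  # i deletions
    for j in range(len(hyp) + 1):
        dp[0][j] = j  # j insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            substitution = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, substitution)
    return dp[len(ref)][len(hyp)] / len(ref)

# 1 substitution ("cat" -> "hat") + 1 insertion ("on") = 2 edits over 3 reference words ≈ 0.67
print(word_error_rate("the cat sat", "the hat sat on"))
```

In practice the evaluation pipeline would typically rely on an existing implementation (for example the jiwer package) rather than hand-rolling this.
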
@@ -195,6 +206,8 @@ def create_model_comparison_plot(metric):
     for lang in top_languages:
         for score in lang["scores"]:
             # Get the value directly using the field name
+            if metric["field_name"] not in score:
+                continue
             value = score[metric["field_name"]]
             if value is not None:
                 scores_flat.append(
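
The two added lines guard against score entries that do not contain the selected metric at all, as happens for results recorded before the asr_wer field existed. A small standalone illustration (the shape of lang["scores"] is assumed here, not taken from the repo):

```python
# Illustration of the membership guard: entries without the field are skipped
# instead of raising KeyError. The score dicts below are made-up examples.
scores = [
    {"mt_bleu": 0.31, "cls_acc": 0.58, "mlm_chrf": 0.72},                   # no ASR result
    {"mt_bleu": 0.29, "cls_acc": 0.61, "mlm_chrf": 0.70, "asr_wer": 0.42},  # has ASR result
]

field = "asr_wer"
values = []
for score in scores:
    if field not in score:
        continue
    if score[field] is not None:
        values.append(score[field])

print(values)  # [0.42]
```
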
@@ -254,15 +267,18 @@ def create_language_stats_df(metric):
             "Overall": round(lang["overall_score"], 3)
             if lang["overall_score"] is not None
             else "N/A",
-            "
+            "Translation": round(lang["mt_bleu"], 3)
             if lang["mt_bleu"] is not None
             else "N/A",
-            "
+            "Classification": round(lang["cls_acc"], 3)
             if lang["cls_acc"] is not None
             else "N/A",
             "MLM": round(lang["mlm_chrf"], 3)
             if lang["mlm_chrf"] is not None
             else "N/A",
+            "ASR": round(lang["asr_wer"], 3)
+            if lang["asr_wer"] is not None
+            else "N/A",
             "Best Model": model_link,
             "CommonVoice Hours": commonvoice_link,
         }
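
The `round(x, 3) if x is not None else "N/A"` pattern now appears five times in this row dict. A possible refactor (hypothetical, not part of this commit) would pull it into a helper:

```python
# Hypothetical helper, not in app.py: format an optional score for the stats table.
def fmt(value, digits=3):
    return round(value, digits) if value is not None else "N/A"

# Made-up language record for demonstration.
lang = {"overall_score": 0.512, "mt_bleu": 0.31, "cls_acc": None, "mlm_chrf": 0.72, "asr_wer": 0.42}
row = {
    "Overall": fmt(lang["overall_score"]),
    "Translation": fmt(lang["mt_bleu"]),
    "Classification": fmt(lang["cls_acc"]),  # None -> "N/A"
    "MLM": fmt(lang["mlm_chrf"]),
    "ASR": fmt(lang["asr_wer"]),
}
print(row)
```

Note that unlike the other columns, ASR reports WER, where lower values indicate better performance.
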
@@ -299,7 +315,7 @@ def create_scatter_plot(metric):
         scores = [
             score[metric["field_name"]]
             for score in lang["scores"]
-            if score[metric["field_name"]] is not None
+            if metric["field_name"] in score and score[metric["field_name"]] is not None
         ]
         if scores:  # Only include if we have valid scores
             avg_score = sum(scores) / len(scores)
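
Here the same missing-field guard is applied inside the list comprehension before averaging, so languages without ASR results neither raise a KeyError nor affect the average. An equivalent formulation (a sketch with an assumed data shape, not the commit's code) uses dict.get, which returns None for missing keys:

```python
# One check covers both "field absent" and "field present but None".
lang = {"scores": [{"asr_wer": 0.42}, {"asr_wer": None}, {"mt_bleu": 0.30}]}
field = "asr_wer"

scores = [s[field] for s in lang["scores"] if s.get(field) is not None]
avg_score = sum(scores) / len(scores) if scores else None
print(scores, avg_score)  # [0.42] 0.42
```
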