He Yingxu committed on
Commit
2330259
·
1 Parent(s): b46797f

breakdown asr sea

Browse files
Files changed (34) hide show
  1. app.py +11 -3
  2. app/content.py +25 -10
  3. app/pages.py +85 -5
  4. process_log.py +116 -0
  5. results_organized/bleu/st.csv +2 -2
  6. results_organized/llama3_70b_judge/accent_recognition.csv +9 -9
  7. results_organized/llama3_70b_judge/sqa_english.csv +2 -0
  8. results_organized/wer/asr_english.csv +3 -3
  9. results_organized/wer/asr_indonesian.csv +13 -0
  10. results_organized/wer/asr_malay.csv +13 -0
  11. results_organized/wer/asr_mandarin.csv +18 -18
  12. results_organized/wer/asr_private.csv +13 -12
  13. results_organized/wer/asr_sea.csv +0 -12
  14. results_organized/wer/asr_singlish.csv +3 -3
  15. results_organized/wer/asr_tamil.csv +13 -0
  16. results_organized/wer/asr_thai.csv +13 -0
  17. results_organized/wer/asr_vietnamese.csv +13 -0
  18. results_organized_archive/bleu/st.csv +12 -0
  19. results_organized_archive/llama3_70b_judge/accent_recognition.csv +12 -0
  20. results_organized_archive/llama3_70b_judge/audio_captioning.csv +12 -0
  21. results_organized_archive/llama3_70b_judge/audio_scene_question_answering.csv +12 -0
  22. results_organized_archive/llama3_70b_judge/emotion_recognition.csv +12 -0
  23. results_organized_archive/llama3_70b_judge/gender_recognition.csv +12 -0
  24. results_organized_archive/llama3_70b_judge/music_understanding.csv +12 -0
  25. results_organized_archive/llama3_70b_judge/sds_singlish.csv +12 -0
  26. results_organized_archive/llama3_70b_judge/speech_instruction.csv +12 -0
  27. results_organized_archive/llama3_70b_judge/sqa_english.csv +12 -0
  28. results_organized_archive/llama3_70b_judge/sqa_singlish.csv +12 -0
  29. results_organized_archive/llama3_70b_judge/under_development_llama3_70b_judge.csv +12 -0
  30. results_organized_archive/meteor/audio_captioning.csv +12 -0
  31. results_organized_archive/wer/asr_english.csv +12 -0
  32. results_organized_archive/wer/asr_mandarin.csv +12 -0
  33. results_organized_archive/wer/asr_singlish.csv +12 -0
  34. results_organized_archive/wer/under_development_wer.csv +14 -0
app.py CHANGED
@@ -19,7 +19,11 @@ pages = {
19
  'ASR-English' : asr_english,
20
  'ASR-Mandarin' : asr_mandarin,
21
  'ASR-Singlish' : asr_singlish,
22
- 'ASR-SEA' : asr_sea,
 
 
 
 
23
  'ASR-Private' : asr_private,
24
  'Speech Translation' : speech_translation,
25
  'SQA-English' : speech_question_answering_english,
@@ -47,9 +51,13 @@ menu_items = [
47
  sac.MenuItem(label='Automatic Speech Recognition', icon='mic',
48
  children = [
49
  sac.MenuItem(label='ASR-English', icon='mic'),
50
- sac.MenuItem(label='ASR-Mandarin', icon='mic'),
51
  sac.MenuItem(label='ASR-Singlish', icon='mic'),
52
- sac.MenuItem(label='ASR-SEA', icon='mic'),
 
 
 
 
 
53
  sac.MenuItem(label='ASR-Private', icon='mic'),
54
  ]
55
  ),
 
19
  'ASR-English' : asr_english,
20
  'ASR-Mandarin' : asr_mandarin,
21
  'ASR-Singlish' : asr_singlish,
22
+ 'ASR-Malay' : asr_malay,
23
+ 'ASR-Tamil' : asr_tamil,
24
+ 'ASR-Indonesian' : asr_indonesian,
25
+ 'ASR-Thai' : asr_thai,
26
+ 'ASR-Vietnamese' : asr_vietnamese,
27
  'ASR-Private' : asr_private,
28
  'Speech Translation' : speech_translation,
29
  'SQA-English' : speech_question_answering_english,
 
51
  sac.MenuItem(label='Automatic Speech Recognition', icon='mic',
52
  children = [
53
  sac.MenuItem(label='ASR-English', icon='mic'),
 
54
  sac.MenuItem(label='ASR-Singlish', icon='mic'),
55
+ sac.MenuItem(label='ASR-Mandarin', icon='mic'),
56
+ sac.MenuItem(label='ASR-Malay', icon='mic'),
57
+ sac.MenuItem(label='ASR-Tamil', icon='mic'),
58
+ sac.MenuItem(label='ASR-Indonesian', icon='mic'),
59
+ sac.MenuItem(label='ASR-Thai', icon='mic'),
60
+ sac.MenuItem(label='ASR-Vietnamese', icon='mic'),
61
  sac.MenuItem(label='ASR-Private', icon='mic'),
62
  ]
63
  ),
app/content.py CHANGED
@@ -23,23 +23,41 @@ asr_singlish_datasets = [
23
 
24
  asr_mandarin_datasets = [
25
  'AISHELL-ASR-ZH',
26
- 'CommonVoice-ZH'
 
27
  ]
28
 
29
 
30
- asr_sea_datasets = [
31
- 'CommonVoice-17-Indonesian',
 
 
 
 
32
  'CommonVoice-17-Tamil',
33
- # 'CommonVoice-17-Thai',
34
- 'CommonVoice-17-Vietnamese',
 
 
 
 
 
35
  'GigaSpeech-2-Indonesain',
 
 
 
 
36
  'GigaSpeech-2-Thai',
37
- 'GigaSpeech-2-Vietnamese',
38
- 'Fleurs-Tamil',
39
  'Lotus-Thai'
40
  ]
41
 
42
 
 
 
 
 
 
 
43
  asr_private_datasets = [
44
  'CNA',
45
  'IDPC',
@@ -52,9 +70,6 @@ asr_private_datasets = [
52
  'Mediacorp-Short',
53
  'YouTube ASR: English Singapore Content',
54
  'YouTube ASR: English with Strong Emotion',
55
- 'YouTube ASR: Malay with English Prompt',
56
- 'YouTube ASR: Chinese with English Prompt',
57
- 'YouTube ASR: Tamil with English Prompt'
58
  ]
59
 
60
 
 
23
 
24
  asr_mandarin_datasets = [
25
  'AISHELL-ASR-ZH',
26
+ 'CommonVoice-ZH',
27
+ 'YouTube ASR: Chinese with English Prompt',
28
  ]
29
 
30
 
31
+ asr_malay_datasets = [
32
+ 'YouTube ASR: Malay with English Prompt'
33
+ ]
34
+
35
+
36
+ asr_tamil_datasets = [
37
  'CommonVoice-17-Tamil',
38
+ 'Fleurs-Tamil',
39
+ 'YouTube ASR: Tamil with English Prompt'
40
+ ]
41
+
42
+
43
+ asr_indonesian_datasets = [
44
+ 'CommonVoice-17-Indonesian',
45
  'GigaSpeech-2-Indonesain',
46
+ ]
47
+
48
+
49
+ asr_thai_datasets = [
50
  'GigaSpeech-2-Thai',
 
 
51
  'Lotus-Thai'
52
  ]
53
 
54
 
55
+ asr_vietnamese_datasets = [
56
+ 'CommonVoice-17-Vietnamese',
57
+ 'GigaSpeech-2-Vietnamese'
58
+ ]
59
+
60
+
61
  asr_private_datasets = [
62
  'CNA',
63
  'IDPC',
 
70
  'Mediacorp-Short',
71
  'YouTube ASR: English Singapore Content',
72
  'YouTube ASR: English with Strong Emotion',
 
 
 
73
  ]
74
 
75
 
app/pages.py CHANGED
@@ -180,12 +180,12 @@ def asr_mandarin():
180
  draw('su', 'asr_mandarin', filter_1, 'wer')
181
 
182
 
183
- def asr_sea():
184
- st.title("Task: Automatic Speech Recognition - SEA Languages")
185
 
186
  sum = ['Overall']
187
 
188
- filters_levelone = sum + asr_sea_datasets
189
 
190
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
191
 
@@ -194,10 +194,90 @@ def asr_sea():
194
 
195
  if filter_1:
196
  if filter_1 in sum:
197
- sum_table_mulit_metrix('asr_sea', ['wer'])
198
  else:
199
  dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
200
- draw('su', 'asr_sea', filter_1, 'wer')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
 
203
  def asr_private():
 
180
  draw('su', 'asr_mandarin', filter_1, 'wer')
181
 
182
 
183
+ def asr_malay():
184
+ st.title("Task: Automatic Speech Recognition - Malay")
185
 
186
  sum = ['Overall']
187
 
188
+ filters_levelone = sum + asr_malay_datasets
189
 
190
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
191
 
 
194
 
195
  if filter_1:
196
  if filter_1 in sum:
197
+ sum_table_mulit_metrix('asr_malay', ['wer'])
198
  else:
199
  dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
200
+ draw('su', 'asr_malay', filter_1, 'wer')
201
+
202
+
203
def asr_tamil():
    """Render the Tamil ASR page.

    Offers a 'Dataset' select box containing 'Overall' followed by the
    entries of ``asr_tamil_datasets``. Choosing 'Overall' calls
    ``sum_table_mulit_metrix``; any other choice calls ``dataset_contents``
    and then ``draw`` for that dataset, always with the 'wer' metric.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm that `draw` belongs to the per-dataset branch.
    st.title("Task: Automatic Speech Recognition - Tamil")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_tamil_datasets

    # Five columns are created but only the first one hosts a widget.
    layout = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with layout[0]:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_tamil', ['wer'])
        return

    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_tamil', filter_1, 'wer')
221
+
222
+
223
def asr_indonesian():
    """Render the Indonesian ASR page.

    Offers a 'Dataset' select box containing 'Overall' followed by the
    entries of ``asr_indonesian_datasets``. Choosing 'Overall' calls
    ``sum_table_mulit_metrix``; any other choice calls ``dataset_contents``
    and then ``draw`` for that dataset, always with the 'wer' metric.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm that `draw` belongs to the per-dataset branch.
    st.title("Task: Automatic Speech Recognition - Indonesian")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_indonesian_datasets

    # Five columns are created but only the first one hosts a widget.
    layout = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with layout[0]:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_indonesian', ['wer'])
        return

    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_indonesian', filter_1, 'wer')
241
+
242
+
243
def asr_thai():
    """Render the Thai ASR page.

    Offers a 'Dataset' select box containing 'Overall' followed by the
    entries of ``asr_thai_datasets``. Choosing 'Overall' calls
    ``sum_table_mulit_metrix``; any other choice calls ``dataset_contents``
    and then ``draw`` for that dataset, always with the 'wer' metric.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm that `draw` belongs to the per-dataset branch.
    st.title("Task: Automatic Speech Recognition - Thai")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_thai_datasets

    # Five columns are created but only the first one hosts a widget.
    layout = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with layout[0]:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_thai', ['wer'])
        return

    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_thai', filter_1, 'wer')
261
+
262
+
263
def asr_vietnamese():
    """Render the Vietnamese ASR page.

    Offers a 'Dataset' select box containing 'Overall' followed by the
    entries of ``asr_vietnamese_datasets``. Choosing 'Overall' calls
    ``sum_table_mulit_metrix``; any other choice calls ``dataset_contents``
    and then ``draw`` for that dataset, always with the 'wer' metric.
    """
    # NOTE(review): block nesting was reconstructed from a flattened diff
    # view; confirm that `draw` belongs to the per-dataset branch.
    st.title("Task: Automatic Speech Recognition - Vietnamese")

    overall_choices = ['Overall']
    dataset_choices = overall_choices + asr_vietnamese_datasets

    # Five columns are created but only the first one hosts a widget.
    layout = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with layout[0]:
        filter_1 = st.selectbox('Dataset', dataset_choices)

    if not filter_1:
        return

    if filter_1 in overall_choices:
        sum_table_mulit_metrix('asr_vietnamese', ['wer'])
        return

    dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
    draw('su', 'asr_vietnamese', filter_1, 'wer')
281
 
282
 
283
  def asr_private():
process_log.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import sys
4
+ import json
5
+ import random
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+
10
+ from app.content import *
11
+
12
# Rows of [model, dataset, metrics, value] gathered from the score files.
data_to_df = []


# Placeholder: point this at the directory that holds one sub-directory of
# score files per evaluated model.
log_dir = "path/to/audiobench/log"

all_evaluated_models = os.listdir(log_dir)
for model_name in all_evaluated_models:
    # Model directories whose name contains these markers are left out.
    if "geyu_whisper" in model_name:
        continue

    if "activation_checkpointing" in model_name:
        continue

    model_dir = os.path.join(log_dir, model_name)

    if not os.path.isdir(model_dir):
        continue

    for log_file in os.listdir(model_dir):
        if not log_file.endswith("score.json"):
            continue

        # File names look like "<dataset>_<metric>_score.json".
        match = re.match(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$", log_file)
        if match is None:
            # A score file for a metric this script does not handle; without
            # this guard `match.group` would raise AttributeError.
            continue

        ds_name = match.group(1)
        metrics = match.group(2)

        eval_path = os.path.join(model_dir, log_file)

        with open(eval_path, "r") as f:
            eval_data = json.load(f)

        if metrics == "llama3_70b_judge":
            # The judge metric nests its score one level deeper.
            value = eval_data[metrics]["judge_score"]
        else:
            # "wer" and "bleu" store the score directly under the metric key.
            value = eval_data[metrics]

        data_to_df.append([model_name, ds_name, metrics, value])


eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
# Map the log directory name of this model to the name used in the result CSVs.
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")

# Archived copies of the result CSVs are read from the first directory;
# the merged CSVs are written to the second.
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"
59
+
60
+
61
def merge_results(display_datasets, metrics, result_sub_path=None):
    """Build one result table from the parsed logs and merge it with its archive.

    Args:
        display_datasets: Display names of the datasets to include; each is
            translated to a raw dataset name via ``displayname2datasetname``.
        metrics: Metric name matched against the ``"metrics"`` column of
            ``eval_result_df``.
        result_sub_path: Path of the CSV relative to ``archive_results_dir``.
            When ``None``, or when no such archive file exists, nothing is
            merged.

    Returns:
        A ``(new_result, combined_result)`` tuple. ``new_result`` has one row
        per model and one column per dataset, and only keeps models that have
        a value for every dataset found. ``combined_result`` is the archive
        table followed by ``new_result``, where a model present in both keeps
        its new row; without an archive it is ``new_result`` itself.
    """
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]

    new_result = eval_result_df[eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Drop models that are missing a score for any of the datasets.
    new_result = new_result.dropna(axis=0, how="any")

    if result_sub_path is None:
        # No archive location given: os.path.join would raise TypeError on None.
        return new_result, new_result

    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if not os.path.exists(archive_result_path):
        return new_result, new_result

    archive_result = pd.read_csv(archive_result_path)
    # Keep only archive columns that belong to the requested datasets.
    archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
    archive_result = archive_result[["Model"] + archive_columns]
    combined_result = pd.concat([archive_result, new_result], ignore_index=True)
    # keep="last" lets a freshly evaluated model override its archived row.
    combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)

    return new_result, combined_result
81
+
82
+
83
# Maps each result CSV (path relative to the results directories) to the list
# of dataset display names whose scores it holds. The first path component is
# used as the metric name by the export loop below.
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}


# Write one merged CSV per mapper entry under `output_results_dir`.
for sub_path, display_ds in result_file_mapper.items():
    # The leading directory of the relative path doubles as the metric name.
    metrics = sub_path.partition("/")[0]
    new_result, combined_result = merge_results(display_ds, metrics, sub_path)

    output_path = os.path.join(output_results_dir, sub_path)
    output_parent = os.path.dirname(output_path)
    os.makedirs(output_parent, exist_ok=True)
    combined_result.to_csv(output_path, index=False)
results_organized/bleu/st.csv CHANGED
@@ -9,8 +9,8 @@ WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994,5.9335222
9
  MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
10
  MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
11
  MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
12
- Qwen2.5-Omni-3B,3.2577143149506815,10.28866767786604,0.020665917336912663,15.00712601210481,8.98152195711894,0.04161842995351044
13
- Qwen2.5-Omni-7B,2.612412992528698,12.429229982446326,0.05482974047730791,12.471476026200369,9.974234734341179,0.02999794683579762
14
  SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
15
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
16
  cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
 
9
  MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
10
  MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
11
  MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
12
+ Qwen2.5-Omni-3B,22.677415597466005,41.3904266408345,0.11385307865833577,44.70177381121325,21.564197151391852,0.2121024080246949
13
+ Qwen2.5-Omni-7B,22.381473837803917,40.43638195419091,0.7240804450352291,43.844763499607055,16.686179809018903,0.05656546920236443
14
  SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
15
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
16
  cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
results_organized/llama3_70b_judge/accent_recognition.csv CHANGED
@@ -6,12 +6,12 @@ whisper_large_v3,,,
6
  old_models,,,
7
  gemini-1.5-flash,,,
8
  WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
9
- MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,,
10
- MERaLiON-AudioLLM-v2-2b,66.59827656955272,,
11
- MERaLiON-AudioLLM-v2-9b,40.78785391875257,,
12
- Qwen2.5-Omni-3B,0.9027492819039803,,
13
- Qwen2.5-Omni-7B,1.661879359868691,,
14
- SALMONN_7B,31.69881001231022,,
15
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,,
16
- cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,,
17
- phi_4_multimodal_instruct,2.6261797291752154,,
 
6
  old_models,,,
7
  gemini-1.5-flash,,,
8
  WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,6.333333333333334,78.0
10
+ MERaLiON-AudioLLM-v2-2b,66.59827656955272,59.73333333333334,53.833333333333336
11
+ MERaLiON-AudioLLM-v2-9b,40.78785391875257,30.325000000000006,54.333333333333336
12
+ Qwen2.5-Omni-3B,0.9027492819039803,0.1,0.4333333333333333
13
+ Qwen2.5-Omni-7B,1.661879359868691,0.06666666666666667,0.03333333333333333
14
+ SALMONN_7B,31.69881001231022,2.833333333333333,0.2
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,38.983333333333334,10.8
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,13.733333333333334,10.166666666666666
17
+ phi_4_multimodal_instruct,2.6261797291752154,6.133333333333333,0.5333333333333333
results_organized/llama3_70b_judge/sqa_english.csv CHANGED
@@ -9,6 +9,8 @@ WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894
9
  MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
10
  MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
11
  MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
 
 
12
  SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
13
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
14
  cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
 
9
  MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
10
  MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
11
  MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
12
+ Qwen2.5-Omni-3B,73.87254901960785,61.07558139534884,59.8504952345356,81.41787758696609,69.99477260846837,60.699999999999996
13
+ Qwen2.5-Omni-7B,77.30392156862746,61.71511627906977,62.86675387777986,81.72611184500221,70.77888133821223,56.10000000000001
14
  SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
15
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
16
  cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
results_organized/wer/asr_english.csv CHANGED
@@ -1,7 +1,6 @@
1
  Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
2
  Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
3
  Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
4
- whisper_large_v3,0.0187874900969555,0.0366012824635405,0.1000186374123559,0.1460242061533738,0.0945902243481269,0.1186395926671187,0.158878997371161,0.0376494801461977,0.032086509484134
5
  old_models,,,,,,,,,
6
  gemini-1.5-flash,,,,,,,,,
7
  WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
@@ -9,10 +8,11 @@ MERaLiON-AudioLLM-Whisper-SEA-LION,0.023937073225940318,0.0422569845082944,0.077
9
  MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
10
  MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
11
  MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
12
- Qwen2.5-Omni-3B,0.01765571358509073,0.03898462178674788,0.08397118270448134,0.2217852079375585,0.09894231227233641,0.12490689375326566,0.18720009894897133,0.03211383556296796,0.052153873426697396
13
- Qwen2.5-Omni-7B,0.02252235258610933,0.04165169198176556,0.08635548614726127,0.31617534194121266,0.12679717916513114,0.23232370957521317,0.2807910240306093,0.0633760334977467,0.09094132246055664
14
  SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
15
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
16
  cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
17
  hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
18
  phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
 
 
1
  Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
2
  Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
3
  Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
 
4
  old_models,,,,,,,,,
5
  gemini-1.5-flash,,,,,,,,,
6
  WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
 
8
  MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
9
  MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
10
  MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
11
+ Qwen2.5-Omni-3B,0.021107631946278342,0.04492405470331209,0.0940546654584482,0.2615060102759792,0.11446542772550759,0.14654089448699847,0.19688006593894564,0.04804655619034101,0.07147668853040241
12
+ Qwen2.5-Omni-7B,0.04404496925340476,0.06877636332683905,0.08028226039678409,0.3124105638254503,0.13967544855837088,0.18939756089426465,0.24105023789319796,0.049146588126752065,0.08381492643148378
13
  SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
14
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
15
  cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
16
  hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
17
  phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
18
+ whisper_large_v3,0.02240917493492285,0.03928726805001229,0.09987082345229144,0.15004646142216516,0.09818771638225171,0.13208580227012484,0.16471049822226413,0.04130442496717646,0.04538202446374756
results_organized/wer/asr_indonesian.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,commonvoice_17_id_asr,gigaspeech2_id_test
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.337184855698226
3
+ MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.17842684134623737
4
+ MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.1722759890883186
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.16282383194620612
6
+ Qwen2.5-Omni-3B,0.13579906155120067,0.2746338157871875
7
+ Qwen2.5-Omni-7B,0.10994571717729322,0.22737303007662502
8
+ SALMONN_7B,1.1888858220627472,2.1181172136986777
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.2191718937327333
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.1926224523482703
11
+ hy_whisper_local_cs,0.10267733922163952,0.21382030476256667
12
+ phi_4_multimodal_instruct,1.327169012788665,5.803850364012302
13
+ whisper_large_v3,0.07512190633912963,0.1961496359876983
results_organized/wer/asr_malay.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,ytb_asr_batch3_malay
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.28989513404414025
3
+ MERaLiON-AudioLLM-v2-2b,0.2798911851169321
4
+ MERaLiON-AudioLLM-v2-9b,0.20907375718485366
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.19463823439076827
6
+ Qwen2.5-Omni-3B,2.943749725768944
7
+ Qwen2.5-Omni-7B,1.4606642973103419
8
+ SALMONN_7B,1.0858672282918695
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.3143784827344127
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.3119213724715897
11
+ hy_whisper_local_cs,0.2421569917950068
12
+ phi_4_multimodal_instruct,3.762932736606555
13
+ whisper_large_v3,0.259620025448642
results_organized/wer/asr_mandarin.csv CHANGED
@@ -1,18 +1,18 @@
1
- Model,aishell_asr_zh_test,commonvoice_zh_asr
2
- Qwen-Audio-Chat,0.9469917443725128,
3
- Qwen2-Audio-7B-Instruct,0.0926035912969452,
4
- whisper_large_v3,0.1235968402922135,
5
- old_models,,
6
- gemini-1.5-flash,,
7
- WavLLM_fairseq,0.7054601967888183,
8
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027
9
- MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344
10
- MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293
11
- MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208
12
- Qwen2.5-Omni-3B,0.08080418126744669,0.08551487145555639
13
- Qwen2.5-Omni-7B,0.08943596444338857,0.0775535468448182
14
- SALMONN_7B,0.9314703727900854,1.0013340021130595
15
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666
16
- cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777
17
- hy_whisper_local_cs,0.15675793391538476,0.287290695068461
18
- phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565
 
1
+ Model,aishell_asr_zh_test,commonvoice_zh_asr,ytb_asr_batch3_chinese
2
+ Qwen-Audio-Chat,0.9469917443725128,,
3
+ Qwen2-Audio-7B-Instruct,0.0926035912969452,,
4
+ old_models,,,
5
+ gemini-1.5-flash,,,
6
+ WavLLM_fairseq,0.7054601967888183,,
7
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027,0.418102808691044
8
+ MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344,0.25613142554319024
9
+ MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293,0.19133015368309486
10
+ MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208,0.1494223635400106
11
+ Qwen2.5-Omni-3B,0.02807309298964582,0.11308069111981474,0.25013248542660305
12
+ Qwen2.5-Omni-7B,0.02438082793846885,0.07647567313746625,0.20640169581346052
13
+ SALMONN_7B,0.9314703727900854,1.0013340021130595,0.8858293587705353
14
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666,0.3469210386857446
15
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777,0.2698675145733969
16
+ hy_whisper_local_cs,0.15675793391538476,0.287290695068461,0.27520932697403283
17
+ phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565,0.44008479067302597
18
+ whisper_large_v3,0.1233691671121142,0.19822204198371451,0.2663275039745628
results_organized/wer/asr_private.csv CHANGED
@@ -1,12 +1,13 @@
1
- Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2,ytb_asr_batch3_chinese,ytb_asr_batch3_malay,ytb_asr_batch3_tamil
2
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463,0.418102808691044,0.28989513404414025,0.6929759165018962
3
- MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278,0.25613142554319024,0.2798911851169321,0.7504943113675407
4
- MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743,0.19133015368309486,0.20907375718485366,0.6644679264853651
5
- MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478,0.1494223635400106,0.19463823439076827,0.5467894071504975
6
- Qwen2.5-Omni-3B,0.15224821104346897,0.3038635572572224,0.19743370402053037,0.13660894985383404,0.1391702058150931,0.09165957044185827,0.0828512006050293,0.12241683951755397,0.24802681370959023,0.2562374138844727,2.2815585099381335,1.2873650773070564
7
- Qwen2.5-Omni-7B,0.17280786072839902,0.4491820396797772,0.6198460222412319,0.26714639082527547,0.3391048676902973,0.2558898665909736,0.22628096048402344,0.20300376430821235,0.34827548924208024,0.19881293057763647,1.4799262866921152,1.0804025801432693
8
- SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506,0.8858293587705353,1.0858672282918695,0.985267900554277
9
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205,0.3469210386857446,0.3143784827344127,0.9665002755178114
10
- cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398,0.2698675145733969,0.3119213724715897,0.8976532365239376
11
- hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155,0.27520932697403283,0.2421569917950068,0.8339924151567211
12
- phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986,0.44008479067302597,3.762932736606555,2.7500567242552916
 
 
1
+ Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463
3
+ MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278
4
+ MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478
6
+ Qwen2.5-Omni-3B,0.17398269785325216,0.2111033762617473,0.19863130881094954,0.1476276141218799,0.1515844495262986,0.10048254328697133,0.09075439591605218,0.1622877775217024,0.24454535625473023
7
+ Qwen2.5-Omni-7B,0.18322118978959734,0.41385311521058127,0.22035928143712574,0.14144366988981336,0.23546226723293043,0.11022802535717664,0.176214785403668,0.17361911346700468,0.35052438101416367
8
+ SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398
11
+ hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155
12
+ phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986
13
+ whisper_large_v3,0.1376695503577913,0.2201531500174034,0.1787852865697177,0.12671463908252756,0.12904279647174127,0.08962531933011637,0.12263187748156551,0.13336406237996465,0.1289869175045951
results_organized/wer/asr_sea.csv DELETED
@@ -1,12 +0,0 @@
1
- Model,commonvoice_17_id_asr,commonvoice_17_ta_asr,commonvoice_17_vi_asr,fleurs_tamil_ta_30_asr,gigaspeech2_id_test,gigaspeech2_th_test,gigaspeech2_vi_test,lotus_thai_th_30_asr
2
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.5284951114826634,0.9221892864704637,0.4624736472241743,0.337184855698226,0.9866395307075302,0.9818897503814326,0.8520208370756243
3
- MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.13853008043879414,0.14196485284776625,0.1432185523541813,0.17842684134623737,0.19968394588770502,0.16825573283269715,0.014873360876594216
4
- MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.15591770571023683,0.15646834639000634,0.16085734364019677,0.1722759890883186,0.20004788698671136,0.11314793912959634,0.018681516076881625
5
- MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.12871226564172622,0.1423883125132331,0.1383345045678145,0.16282383194620612,0.18238237758889023,0.09499798648962901,0.010670019759295851
6
- Qwen2.5-Omni-3B,0.13731714049130556,1.0276387288835422,0.2463476603853483,1.3477160927617708,0.3110002953799107,0.4670274152998923,0.19581530154444754,0.4822705227231902
7
- Qwen2.5-Omni-7B,0.18235348238108381,1.0684188526512177,0.22041075587550285,1.2090302178496135,0.26146334682814104,0.2936956781994493,0.22408385278119664,0.0984012933357284
8
- SALMONN_7B,1.1888858220627472,1.4272941368377052,1.496294727927165,1.507519325368939,2.1181172136986777,1.2470441757452413,1.5460526688938172,1.1351535836177475
9
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.23805397249380653,0.1567859411391065,0.2724525650035137,0.2191718937327333,0.276058900993655,0.17136958408249153,0.06815160768816239
10
- cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.24404355317218387,0.11676900275248782,0.28397751229796203,0.1926224523482703,0.20872022028013887,0.15538061017872032,0.031794503323154304
11
- hy_whisper_local_cs,0.10267733922163952,0.31793713743921215,0.1681134871903451,0.33113141250878425,0.21382030476256667,0.26486292350053875,0.1781020821398794,0.076019400035926
12
- phi_4_multimodal_instruct,1.327169012788665,1.1784589191228196,1.1070294304467498,1.7016514406184118,5.803850364012302,1.7344522925894887,2.5042567310800923,1.2856834920064666
 
 
 
 
 
 
 
 
 
 
 
 
 
results_organized/wer/asr_singlish.csv CHANGED
@@ -1,7 +1,6 @@
1
  Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
2
  Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
3
  Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
4
- whisper_large_v3,0.0684417136030039,0.3171008846684522,0.2702636652456078,0.4618189591218298,0.2143555471246589,0.1698509342851144
5
  old_models,,,,,,
6
  gemini-1.5-flash,,,,,,
7
  WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
@@ -9,10 +8,11 @@ MERaLiON-AudioLLM-Whisper-SEA-LION,0.04303513520103382,0.0473581689797906,0.2129
9
  MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
10
  MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
11
  MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
12
- Qwen2.5-Omni-3B,0.04657059956599127,0.11265319373427482,0.49541097564287073,1.0728162054093475,0.273861464154908,0.17795830036014793
13
- Qwen2.5-Omni-7B,0.04854558310779509,0.12052593133674215,0.6256143590300595,1.1316375158747123,0.34107192365498823,0.36374941455772863
14
  SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
15
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
16
  cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
17
  hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
18
  phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
 
 
1
  Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
2
  Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
3
  Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
 
4
  old_models,,,,,,
5
  gemini-1.5-flash,,,,,,
6
  WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
 
8
  MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
9
  MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
10
  MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
11
+ Qwen2.5-Omni-3B,0.05298320044863824,0.0947975002029056,0.47520840687539034,1.2504495215581737,0.27988793392771155,0.18302944168994978
12
+ Qwen2.5-Omni-7B,0.05291005291005291,0.09410762113464816,0.5354359573139272,1.3034993524374756,0.37375786140578715,0.27471373891697215
13
  SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
14
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
15
  cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
16
  hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
17
  phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
18
+ whisper_large_v3,0.06853924365445102,0.3183183183183183,0.31976538952399053,0.5026468332306454,0.23660825028089477,0.19798446357337812
results_organized/wer/asr_tamil.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,commonvoice_17_ta_asr,fleurs_tamil_ta_30_asr,ytb_asr_batch3_tamil
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.5284951114826634,0.4624736472241743,0.6929759165018962
3
+ MERaLiON-AudioLLM-v2-2b,0.13853008043879414,0.1432185523541813,0.7504943113675407
4
+ MERaLiON-AudioLLM-v2-9b,0.15591770571023683,0.16085734364019677,0.6644679264853651
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.12871226564172622,0.1383345045678145,0.5467894071504975
6
+ Qwen2.5-Omni-3B,0.8307319012713203,1.653935347856641,1.4607630222683219
7
+ Qwen2.5-Omni-7B,0.8465494917777076,0.8666549543218552,1.3615441962983372
8
+ SALMONN_7B,1.4272941368377052,1.507519325368939,0.985267900554277
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.23805397249380653,0.2724525650035137,0.9665002755178114
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.24404355317218387,0.28397751229796203,0.8976532365239376
11
+ hy_whisper_local_cs,0.31793713743921215,0.33113141250878425,0.8339924151567211
12
+ phi_4_multimodal_instruct,1.1784589191228196,1.7016514406184118,2.7500567242552916
13
+ whisper_large_v3,0.2713203584572879,0.276317638791286,0.8413665683446242
results_organized/wer/asr_thai.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,gigaspeech2_th_test,lotus_thai_th_30_asr
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.9866395307075302,0.8520208370756243
3
+ MERaLiON-AudioLLM-v2-2b,0.19968394588770502,0.014873360876594216
4
+ MERaLiON-AudioLLM-v2-9b,0.20004788698671136,0.018681516076881625
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.18238237758889023,0.010670019759295851
6
+ Qwen2.5-Omni-3B,0.3000742248294026,0.026225974492545358
7
+ Qwen2.5-Omni-7B,0.23150963725607565,0.021483743488413868
8
+ SALMONN_7B,1.2470441757452413,1.1351535836177475
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.276058900993655,0.06815160768816239
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.20872022028013887,0.031794503323154304
11
+ hy_whisper_local_cs,0.26486292350053875,0.076019400035926
12
+ phi_4_multimodal_instruct,1.7344522925894887,1.2856834920064666
13
+ whisper_large_v3,0.22202801388722615,0.03933896173881803
results_organized/wer/asr_vietnamese.csv ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,commonvoice_17_vi_asr,gigaspeech2_vi_test
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.9221892864704637,0.9818897503814326
3
+ MERaLiON-AudioLLM-v2-2b,0.14196485284776625,0.16825573283269715
4
+ MERaLiON-AudioLLM-v2-9b,0.15646834639000634,0.11314793912959634
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.1423883125132331,0.09499798648962901
6
+ Qwen2.5-Omni-3B,0.19648528477662502,0.17708681916408126
7
+ Qwen2.5-Omni-7B,0.18367562989625238,0.22730546937479085
8
+ SALMONN_7B,1.496294727927165,1.5460526688938172
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.1567859411391065,0.17136958408249153
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.11676900275248782,0.15538061017872032
11
+ hy_whisper_local_cs,0.1681134871903451,0.1781020821398794
12
+ phi_4_multimodal_instruct,1.1070294304467498,2.5042567310800923
13
+ whisper_large_v3,0.12873173830192675,0.17700741312128138
results_organized_archive/bleu/st.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
2
+ Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.03451483807236294,0.45648619714728844,9.898238298955656,0.01699144301093184
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,37.60224687716629,43.941098854450516,14.407399367512914,44.43289180618449,18.76473995941838,5.023057608950299
4
+ hy_whisper_local_cs,1.0869208512565696,0.10573269629215352,0.008950516549431693,22.267131378964944,7.31707791416422,2.8610263518826757
5
+ Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.03245972071872916,6.326113431899141,16.466557744958333,0.04425838146050298
6
+ whisper_large_v3,1.600581653970121,0.16408986541757878,0.02107778621423822,46.01512198258627,14.673689493155793,2.451098639578599
7
+ old_models,,,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.620150160643625,35.274306071307024,8.433062902024755,46.80524126004861,15.209998552437538,2.8327095799289337
9
+ gemini-1.5-flash,,,,,,
10
+ WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994286,5.933522277713613,2.368659001743569,0.1695522548322915
11
+ SALMONN_7B,14.102682915273142,33.88941292215531,0.00046745670226766583,26.89649039333571,5.296039450108202,0.3649023706010388
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,10.930203684508578,5.987143868370054,1.0368044741318085,46.79924664837527,14.154700735606419,2.4245628096245917
results_organized_archive/llama3_70b_judge/accent_recognition.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
2
+ Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,47.01682396389003,7.816666666666666,77.83333333333333
4
+ hy_whisper_local_cs,,,
5
+ Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666667
6
+ whisper_large_v3,,,
7
+ old_models,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.640951990151827,26.016666666666666,7.633333333333334
9
+ gemini-1.5-flash,,,
10
+ WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.23333333333333336
11
+ SALMONN_7B,34.222404595814524,2.5166666666666666,0.06666666666666667
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,39.32704144439885,12.416666666666666,9.666666666666666
results_organized_archive/llama3_70b_judge/audio_captioning.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,audiocaps_test,wavcaps_test
2
+ Qwen-Audio-Chat,47.04090909090909,32.9364161849711
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,38.00454545454545,33.97687861271676
4
+ hy_whisper_local_cs,,
5
+ Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
6
+ whisper_large_v3,,
7
+ old_models,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,3.0954545454545457,6.3468208092485545
9
+ gemini-1.5-flash,,
10
+ WavLLM_fairseq,5.5,6.901734104046243
11
+ SALMONN_7B,37.445454545454545,23.76878612716763
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,2.4727272727272727,3.445086705202312
results_organized_archive/llama3_70b_judge/audio_scene_question_answering.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
2
+ Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,63.15021876519203,49.77635782747604,46.31578947368421
4
+ hy_whisper_local_cs,,,
5
+ Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
6
+ whisper_large_v3,,,
7
+ old_models,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.647544968400585,18.466453674121407,18.88157894736842
9
+ gemini-1.5-flash,,,
10
+ WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
11
+ SALMONN_7B,57.75401069518716,50.287539936102235,47.30263157894737
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,29.47134606841404,17.380191693290733,16.710526315789473
results_organized_archive/llama3_70b_judge/emotion_recognition.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
2
+ Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,48.505976095617534,46.206896551724135,36.36015325670498
4
+ hy_whisper_local_cs,,,
5
+ Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
6
+ whisper_large_v3,,,
7
+ old_models,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,44.322709163346616,56.59003831417625,47.356321839080465
9
+ gemini-1.5-flash,,,
10
+ WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
11
+ SALMONN_7B,23.804780876494025,41.7624521072797,30.536398467432953
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,46.713147410358566,45.593869731800766,36.81992337164751
results_organized_archive/llama3_70b_judge/gender_recognition.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,voxceleb_gender_test,iemocap_gender_test,imda_gr_sentence,imda_gr_dialogue
2
+ Qwen-Audio-Chat,70.5990972507181,50.0996015936255,57.550000000000004,37.2
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,99.75379565038982,93.48605577689243,66.13333333333333,93.76666666666667
4
+ hy_whisper_local_cs,,,,
5
+ Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023903,68.38333333333333,61.56666666666667
6
+ whisper_large_v3,,,,
7
+ old_models,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,34.94050061551087,15.737051792828685,26.35,19.6
9
+ gemini-1.5-flash,,,,
10
+ WavLLM_fairseq,69.61427985227739,51.932270916334666,49.06666666666666,46.766666666666666
11
+ SALMONN_7B,88.79770209273697,81.31474103585658,59.766666666666666,42.733333333333334
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,42.921624948707425,44.22310756972111,36.016666666666666,25.433333333333337
results_organized_archive/llama3_70b_judge/music_understanding.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,muchomusic_test
2
+ Qwen-Audio-Chat,59.0564448188711
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,57.7927548441449
4
+ hy_whisper_local_cs,
5
+ Qwen2-Audio-7B-Instruct,71.60909856781802
6
+ whisper_large_v3,
7
+ old_models,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.727042965459134
9
+ gemini-1.5-flash,
10
+ WavLLM_fairseq,44.3133951137321
11
+ SALMONN_7B,50.88458298230834
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,56.44481887110362
results_organized_archive/llama3_70b_judge/sds_singlish.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
2
+ Qwen-Audio-Chat,16.4,16.0,28.2,40.4
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,48.4,46.4,57.0,62.599999999999994
4
+ hy_whisper_local_cs,,,,
5
+ Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
6
+ whisper_large_v3,,,,
7
+ old_models,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,45.4,44.0,58.0,65.4
9
+ gemini-1.5-flash,,,,
10
+ WavLLM_fairseq,31.6,31.6,45.199999999999996,49.400000000000006
11
+ SALMONN_7B,9.0,7.0,17.2,24.2
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,37.400000000000006,36.0,49.0,57.199999999999996
results_organized_archive/llama3_70b_judge/speech_instruction.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,openhermes_audio_test,alpaca_audio_test
2
+ Qwen-Audio-Chat,10.600000000000001,9.8
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,65.6,74.80000000000001
4
+ hy_whisper_local_cs,,
5
+ Qwen2-Audio-7B-Instruct,44.800000000000004,52.599999999999994
6
+ whisper_large_v3,,
7
+ old_models,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,72.2,73.8
9
+ gemini-1.5-flash,,
10
+ WavLLM_fairseq,19.2,21.6
11
+ SALMONN_7B,15.8,17.2
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,63.0,70.8
results_organized_archive/llama3_70b_judge/sqa_english.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test
2
+ Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.232056362835756,59.749085206481965
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,86.76470588235293,59.7093023255814,73.66473556344609,88.50726552179657,84.31782540512285
4
+ hy_whisper_local_cs,,,,,
5
+ Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914
6
+ whisper_large_v3,,,,,
7
+ old_models,,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,88.57843137254902,73.11046511627907,88.61894972902262,91.85380889476001,89.33612127548353
9
+ gemini-1.5-flash,,,,89.25583443416997,
10
+ WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954
11
+ SALMONN_7B,83.48039215686273,59.24418604651163,66.39506634273968,50.99075297225891,56.455828541557764
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,82.99019607843137,64.94186046511628,83.81984675761541,85.2928225451343,86.4610559330894
results_organized_archive/llama3_70b_judge/sqa_singlish.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
2
+ Qwen-Audio-Chat,32.2,37.8,47.800000000000004,51.4
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,51.4,53.2,64.80000000000001,67.2
4
+ hy_whisper_local_cs,,,,
5
+ Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
6
+ whisper_large_v3,,,,
7
+ old_models,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,56.0,66.0,74.0,71.6
9
+ gemini-1.5-flash,,,,
10
+ WavLLM_fairseq,45.199999999999996,46.6,50.8,62.199999999999996
11
+ SALMONN_7B,40.599999999999994,36.6,44.6,46.8
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,49.0,53.8,57.800000000000004,64.0
results_organized_archive/llama3_70b_judge/under_development_llama3_70b_judge.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,ytb_sqa_batch1,ytb_sds_batch1,ytb_pqa_batch1
2
+ Qwen-Audio-Chat,60.827586206896555,43.878954607977995,37.16117216117216
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,64.51231527093596,53.97524071526823,40.97069597069597
4
+ hy_whisper_local_cs,60.137931034482754,60.9353507565337,45.78754578754578
5
+ Qwen2-Audio-7B-Instruct,60.453201970443345,51.5818431911967,36.97802197802198
6
+ whisper_large_v3,,,
7
+ old_models,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
9
+ gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
10
+ WavLLM_fairseq,60.70935960591133,55.625859697386524,40.95238095238095
11
+ SALMONN_7B,55.665024630541865,31.279229711141674,32.124542124542124
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
results_organized_archive/meteor/audio_captioning.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,audiocaps_test,wavcaps_test
2
+ Qwen-Audio-Chat,0.27553015076950976,0.2355106805560457
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.24920047034353812,0.3175511907248581
4
+ hy_whisper_local_cs,,
5
+ Qwen2-Audio-7B-Instruct,0.19891712076314283,0.21342294856199182
6
+ whisper_large_v3,,
7
+ old_models,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.05796819723943051,0.120421856260385
9
+ gemini-1.5-flash,,
10
+ WavLLM_fairseq,0.041732965094428545,0.06399522524688675
11
+ SALMONN_7B,0.20994052484339956,0.17175112770658157
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.07953048457785493,0.1388630786594543
results_organized_archive/wer/asr_english.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
2
+ Qwen-Audio-Chat,0.020258799562379748,0.043467569561352074,0.11272421128398918,0.31419144746723354,0.13018910022587737,0.2655529121410546,0.3664994875132684,0.04052375714133636,0.2911540507002305
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.024333195005092994,0.04212457676811621,0.07789795695400416,0.21632867288683053,0.14468436081215577,0.1384587164122689,0.16563713100701868,0.08094105957914907,0.10501684098564085
4
+ hy_whisper_local_cs,0.02554042328441544,0.053417065466169825,0.1066766923091754,0.1991585778678581,0.0948233719154953,0.10871196540338629,0.1463228189913085,0.0467690997480572,0.05275660343910654
5
+ Qwen2-Audio-7B-Instruct,0.035141660693401744,0.060415760304159495,0.11438872500819404,0.2165498391593041,0.11723812890302816,0.18872219319407232,0.23542555661330924,0.06114048472375004,0.08739585179932637
6
+ whisper_large_v3,0.01878749009695552,0.03660128246354058,0.10001863741235596,0.14602420615337386,0.09459022434812692,0.11863959266711877,0.15887899737116104,0.037649480146197796,0.03208650948413402
7
+ old_models,,,,,,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.032349945297468596,0.05307658841999735,0.10600831614192711,0.20140159998943682,0.09948381629977261,0.11416493424197618,0.1448629161356777,0.04900464852205386,0.04396383619925545
9
+ gemini-1.5-flash,,,,,,,,,
10
+ WavLLM_fairseq,0.02103218017882069,0.04798834811886432,0.14533325621300636,0.3792176325635977,0.15491778414546403,0.6447482518259942,0.6671766188447099,0.06621482559171073,0.4536784258110264
11
+ SALMONN_7B,0.10270871845172973,0.09671439650443565,0.3062255383962828,0.23699946689025367,0.10765150204693537,0.2577708974886327,0.3597423676988383,0.0459884319222171,0.14231519234178336
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.018334779492209605,0.03714982881570734,0.09876543209876543,0.14540692118393275,0.09515429104337297,0.11773910240019567,0.15611126487402763,0.038146268762641496,0.04754476156709803
results_organized_archive/wer/asr_mandarin.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,aishell_asr_zh_test
2
+ Qwen-Audio-Chat,0.9469917443725129
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.12812060739244918
4
+ hy_whisper_local_cs,0.16361782582011838
5
+ Qwen2-Audio-7B-Instruct,0.09260359129694522
6
+ whisper_large_v3,0.12359684029221357
7
+ old_models,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20886539565639167
9
+ gemini-1.5-flash,
10
+ WavLLM_fairseq,0.7054601967888183
11
+ SALMONN_7B,0.8259290055631446
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111
results_organized_archive/wer/asr_singlish.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
2
+ Qwen-Audio-Chat,0.10550313315290274,0.45479263046830615,0.6412550574306894,1.173131813552289,0.3016882870525747,0.31394240863063033
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.042815692585277836,0.04719584449314179,0.2139462894072284,0.3002929748896629,0.15368227517473845,0.10833508293092589
4
+ hy_whisper_local_cs,0.06319947333772219,0.2719340962584206,0.23856138159502538,0.33742408429629445,0.16663991478309087,0.12873269917149824
5
+ Qwen2-Audio-7B-Instruct,0.07197717796796138,0.1905689473257041,0.35076166942732234,0.5613424034000176,0.27856006770658537,0.2245352799625317
6
+ whisper_large_v3,0.06844171360300393,0.3171008846684522,0.27026366524560785,0.4618189591218298,0.2143555471246589,0.1698509342851144
7
+ old_models,,,,,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07041669714480775,0.32988393799204613,0.3035544573275043,0.4779640131272869,0.22881615619208825,0.1789273082575623
9
+ gemini-1.5-flash,,,,,,
10
+ WavLLM_fairseq,0.10077292565771828,0.4463923382842302,0.7540934640345399,1.143645714142011,0.39796588405247263,0.42541061709652933
11
+ SALMONN_7B,0.0925804013361617,0.42346400454508565,0.6569229098215983,0.7593582215292535,0.34868891450584405,0.24872817713464365
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.29992939962527493,0.4750971343786543,0.22004640235805695,0.17467982364056267
results_organized_archive/wer/under_development_wer.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,cna_test,idpc_test,parliament_test,ukusnews_test,mediacorp_test,idpc_short_test,parliament_short_test,ukusnews_short_test,mediacorp_short_test,ytb_asr_batch1,ytb_asr_batch2,seame_dev_man,seame_dev_sge,ytb_asr_batch3_malay,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_chinese,ytb_asr_batch3_zh_zh_prompt
2
+ Qwen-Audio-Chat,0.19753284203780838,0.7710863986313088,0.26279685873781816,0.3158631121194933,0.4498529892192094,0.6008025988916491,0.09347360821020603,0.10399586086125925,0.2548909377108163,0.2297764461857571,0.4315277327278625,0.8783373786407767,1.05567969634822,2.8890790224211313,2.8990790224211313,,
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.15924383210509452,0.30008554319931563,0.058922319992430694,0.12554358101720553,0.170859196341065,0.24918784635964075,0.056935097083623425,0.10144869855926132,0.13301101866426804,0.11484981178458939,0.15162720294085846,0.388282092772384,0.35550521901496834,0.289500241,0.3031898556447721,0.29155272919978803,0.28269210386857446
4
+ hy_whisper_local_cs,0.15710776460536152,0.19863130881094954,0.058638471000094616,0.07199848742673473,0.13124795818360013,0.17638066118861073,0.06559913359634872,0.07828544137546764,0.1154711041151338,0.11546439271721595,0.22990593577684074,0.3134101941747573,0.33199669411368576,,,,
5
+ Qwen2-Audio-7B-Instruct,0.2067713339741536,0.19093242087254064,0.23270886555019396,0.13843826810361126,0.18694870957203527,0.21326199120963119,0.08416492612361723,0.1194380323171217,0.17180121430177647,0.16843358684796805,0.2080008649583739,0.5522518878101402,0.5486546879304539,0.9251458909218551,0.9981132903339037
6
+ whisper_large_v3,0.13841717398269784,0.19880239520958085,0.0753619074652285,0.07135564378899603,0.12054884024828487,0.1662526275558953,0.05543951935226013,0.06168908700151238,0.11715763436024286,0.12226319428439733,0.17210509244242622,0.7225930420711975,0.5377268970583734,0.237374402,0.237374402,0.21278219395866454,0.21278219395866454
7
+ whisper_large_v2,,,,,,,,,,,,,,,,0.2802967673555909,0.2802967673555909
8
+ old_models,,,,,,,,,,,,,,,,,
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.16766467065868262,0.06282524363705176,0.07388920400831915,0.12455080039202875,0.16931014714313014,0.07325752301384698,0.06877338215394412,0.14571621317742298,0.1400092187139894,0.2192622950819672,0.7824973031283711,0.5840399155162387,,
10
+ gemini-1.5-flash,,,,,,,,,,0.1089344703080587,,0.9690871089536138,1.1100431601824359,,
11
+ WavLLM_fairseq,0.26946491509131687,0.7686911890504705,0.5216434856656259,0.5911892607298166,0.3595230316889905,0.36728454041658704,0.09512390087929656,0.2066783411605508,0.2621992354396222,0.41876008296842593,0.48091685587631094,1.2913969795037756,1.2204842511249197,,
12
+ SALMONN_7B,0.15395706504325538,0.4550898203592814,0.3010928186204939,0.18918510115333712,0.32089186540346293,0.26313777947639977,0.08676929424202573,0.09042426172092653,0.1751742747919946,0.21487285856956287,0.3238620391393664,1.2721817691477886,1.0189782362484312,,
13
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.13798996048275125,0.17741659538066723,0.07517267480367111,0.07642276422764227,0.13598497223129696,0.15803554366520162,0.05742502771975968,0.0700867627159118,0.11434675061839443,0.12579703464700007,0.23561466104443723,0.6848705501618123,0.507882090054792,,
14
+ Phi4-Multimodal-Instruct,,,,,,,,,,,,,,,,0.3390567037625861,0.21534711181770005