# app/content.py — dataset groupings and display metadata for the leaderboard.
# Uploaded to Hugging Face with huggingface_hub by zhuohan-7 (commit d931f3e, verified).
# --- ASR (Automatic Speech Recognition) benchmarks, grouped by language ---
# Every display name listed here must have a matching key in
# wer_displayname2datasetname (defined further down in this file).
asr_english_datasets = [
    'LibriSpeech-Clean',
    'LibriSpeech-Other',
    'CommonVoice-15-EN',
    'Peoples-Speech',
    'GigaSpeech-1',
    'Earnings-21',
    'Earnings-22',
    'TED-LIUM-3',
    'TED-LIUM-3-LongForm',
]
# Singlish: MNSC parts 1-6 (IMDA National Speech Corpus derived).
asr_singlish_datasets = [
    'MNSC-PART1-ASR',
    'MNSC-PART2-ASR',
    'MNSC-PART3-ASR',
    'MNSC-PART4-ASR',
    'MNSC-PART5-ASR',
    'MNSC-PART6-ASR',
]
asr_mandarin_datasets = [
    'AISHELL-ASR-ZH',
    'CommonVoice-ZH',
    'YouTube ASR: Chinese with English Prompt',
]
asr_malay_datasets = [
    'YouTube ASR: Malay with English Prompt',
]
asr_tamil_datasets = [
    'CommonVoice-17-Tamil',
    'Fleurs-Tamil',
    'YouTube ASR: Tamil with English Prompt',
]
asr_indonesian_datasets = [
    'CommonVoice-17-Indonesian',
    # NOTE(review): 'Indonesain' is a typo for 'Indonesian', but the same
    # spelling is used as the lookup key throughout this file, so it must not
    # be fixed in isolation — rename all occurrences together.
    'GigaSpeech-2-Indonesain',
]
asr_thai_datasets = [
    # FIX: 'CommonVoice-17-Thai' has a dataset mapping and a description in
    # this file but was missing from every language list; added so it is
    # displayed like the other CommonVoice-17 sets.
    'CommonVoice-17-Thai',
    'GigaSpeech-2-Thai',
    'Lotus-Thai',
]
asr_vietnamese_datasets = [
    'CommonVoice-17-Vietnamese',
    'GigaSpeech-2-Vietnamese',
]
# Private / internal evaluation sets plus curated YouTube batches.
asr_private_datasets = [
    'CNA',
    'IDPC',
    'Parliament',
    'UKUS-News',
    'Mediacorp',
    'IDPC-Short',
    'Parliament-Short',
    'UKUS-News-Short',
    'Mediacorp-Short',
    'YouTube ASR: English Singapore Content',
    'YouTube ASR: English with Strong Emotion',
]
# --- Speech translation (CoVoST 2 language pairs) ---
speech_translation_datasets = [
    'CoVoST2-EN-ID',
    'CoVoST2-EN-ZH',
    'CoVoST2-EN-TA',
    'CoVoST2-ID-EN',
    'CoVoST2-ZH-EN',
    'CoVoST2-TA-EN',
]
# --- Spoken question answering ---
speech_qa_english_datasets = [
    'CN-College-Listen-MCQ',
    'DREAM-TTS-MCQ',
    'SLUE-P2-SQA5',
    'Public-SG-Speech-QA',
    'Spoken-SQuAD',
    'MMAU-mini',
]
speech_qa_singlish_datasets = [
    'MNSC-PART3-SQA',
    'MNSC-PART4-SQA',
    'MNSC-PART5-SQA',
    'MNSC-PART6-SQA',
]
# --- Spoken dialogue summarization (MNSC parts 3-6) ---
sds_datasets = ['MNSC-PART3-SDS', 'MNSC-PART4-SDS', 'MNSC-PART5-SDS', 'MNSC-PART6-SDS']
# --- Speech instruction following ---
si_datasets = ['OpenHermes-Audio', 'ALPACA-Audio']
# --- Audio captioning ---
ac_datasets = ['WavCaps', 'AudioCaps']
# --- Audio-scene question answering ---
asqa_datasets = ['Clotho-AQA', 'WavCaps-QA', 'AudioCaps-QA']
# --- Emotion / sentiment recognition ---
er_datasets = ['IEMOCAP-Emotion', 'MELD-Sentiment', 'MELD-Emotion']
# --- Accent recognition ---
ar_datasets = ['VoxCeleb-Accent', 'MNSC-AR-Sentence', 'MNSC-AR-Dialogue']
# --- Gender recognition ---
gr_datasets = ['VoxCeleb-Gender', 'IEMOCAP-Gender']
# --- Music understanding ---
music_datasets = ['MuChoMusic']
# --- Under-development tracks (WER-scored and non-WER-scored) ---
wer_development_datasets = [
    'YouTube ASR: Malay with Malay Prompt',
    'YouTube ASR: Chinese with Chinese Prompt',
    'SEAME-Dev-Mandarin',
    'SEAME-Dev-Singlish',
]
non_wer_development_datasets = [
    'YouTube SQA: English with Singapore Content',
    'YouTube SDS: English with Singapore Content',
    'YouTube PQA: English with Singapore Content',
]
# Display name -> internal dataset id for every benchmark scored with WER
# (Word Error Rate). The combined and reverse maps are derived further below.
# NOTE(review): keys here are the exact strings used in the asr_* lists above;
# double-check that each key appears in exactly one of those lists.
wer_displayname2datasetname = {
'LibriSpeech-Clean' : 'librispeech_test_clean',
'LibriSpeech-Other' : 'librispeech_test_other',
'CommonVoice-15-EN' : 'common_voice_15_en_test',
'Peoples-Speech' : 'peoples_speech_test',
'GigaSpeech-1' : 'gigaspeech_test',
'Earnings-21' : 'earnings21_test',
'Earnings-22' : 'earnings22_test',
'TED-LIUM-3' : 'tedlium3_test',
'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
# MNSC / IMDA National Speech Corpus ASR test splits (parts 3-6 are 30s clips).
'MNSC-PART1-ASR' : 'imda_part1_asr_test',
'MNSC-PART2-ASR' : 'imda_part2_asr_test',
'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
'CommonVoice-ZH' : 'commonvoice_zh_asr',
'CommonVoice-17-Indonesian' : 'commonvoice_17_id_asr',
'CommonVoice-17-Tamil' : 'commonvoice_17_ta_asr',
'CommonVoice-17-Thai' : 'commonvoice_17_th_asr',
'CommonVoice-17-Vietnamese' : 'commonvoice_17_vi_asr',
# NOTE(review): 'Indonesain' is a typo for 'Indonesian' but is used as the key
# consistently across this file — any rename must change all occurrences at once.
'GigaSpeech-2-Indonesain' : 'gigaspeech2_id_test',
'GigaSpeech-2-Thai' : 'gigaspeech2_th_test',
'GigaSpeech-2-Vietnamese' : 'gigaspeech2_vi_test',
'Fleurs-Tamil' : 'fleurs_tamil_ta_30_asr',
'Lotus-Thai' : 'lotus_thai_th_30_asr',
# Private / internal evaluation sets.
'CNA' : 'cna_test',
'IDPC' : 'idpc_test',
'Parliament' : 'parliament_test',
'UKUS-News' : 'ukusnews_test',
'Mediacorp' : 'mediacorp_test',
'IDPC-Short' : 'idpc_short_test',
'Parliament-Short': 'parliament_short_test',
'UKUS-News-Short' : 'ukusnews_short_test',
'Mediacorp-Short' : 'mediacorp_short_test',
# Curated YouTube ASR batches (batch 3 has per-language prompt variants).
'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
'YouTube ASR: Tamil with English Prompt': 'ytb_asr_batch3_tamil',
'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
# SEAME code-switching dev splits (Mandarin / Singlish dominant).
'SEAME-Dev-Mandarin' : 'seame_dev_man',
'SEAME-Dev-Singlish' : 'seame_dev_sge',
}
# Display name -> internal dataset id for every benchmark NOT scored with WER
# (translation, QA, summarization, captioning, classification, music).
non_wer_displayname2datasetname = {
# CoVoST 2 speech translation pairs.
'CoVoST2-EN-ID' : 'covost2_en_id_test',
'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
'CoVoST2-EN-TA' : 'covost2_en_ta_test',
'CoVoST2-ID-EN' : 'covost2_id_en_test',
'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
'CoVoST2-TA-EN' : 'covost2_ta_en_test',
# Spoken question answering.
'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
'Spoken-SQuAD' : 'spoken_squad_test',
'MMAU-mini' : 'mmau_mini',
'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
# Spoken dialogue summarization ('ds' in the internal id).
'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
# Speech instruction following.
'OpenHermes-Audio' : 'openhermes_audio_test',
'ALPACA-Audio' : 'alpaca_audio_test',
# Audio captioning and audio-scene QA.
'WavCaps' : 'wavcaps_test',
'AudioCaps' : 'audiocaps_test',
'Clotho-AQA' : 'clotho_aqa_test',
'WavCaps-QA' : 'wavcaps_qa_test',
'AudioCaps-QA' : 'audiocaps_qa_test',
# Paralinguistic classification: emotion, sentiment, accent, gender.
'IEMOCAP-Emotion' : 'iemocap_emotion_test',
'MELD-Sentiment' : 'meld_sentiment_test',
'MELD-Emotion' : 'meld_emotion_test',
'VoxCeleb-Accent' : 'voxceleb_accent_test',
'MNSC-AR-Sentence' : 'imda_ar_sentence',
'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
'VoxCeleb-Gender' : 'voxceleb_gender_test',
'IEMOCAP-Gender' : 'iemocap_gender_test',
'MuChoMusic' : 'muchomusic_test',
# YouTube batches for QA / summarization / paralinguistic QA.
'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
'YouTube SQA: Tamil': 'ytb_sqa_batch3_tamil',
'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
# NOTE(review): the ids below look like ASR-translation variants of batch 3;
# 'ma' for Malay differs from the 'ms'/'malay' abbreviations used above — confirm.
'YouTube-TA-En':'ytb_asr_batch3_ta_en',
'YouTube-ZH-En':'ytb_asr_batch3_zh_en',
'YouTube-MA-En':'ytb_asr_batch3_ma_en',
}
# Combined display-name -> dataset-id lookup across WER and non-WER benchmarks.
displayname2datasetname = {**wer_displayname2datasetname, **non_wer_displayname2datasetname}
# Reverse lookup (dataset id -> display name).
# NOTE(review): the name keeps the historical 'diaplayname' misspelling because
# other modules may import it by this exact name.
datasetname2diaplayname = {dataset: display for display, dataset in displayname2datasetname.items()}
# Display name -> human-readable description shown in the leaderboard UI.
# Values may contain HTML (<br>, <i>) rendered by the front end.
# NOTE(review): the variable name keeps the historical 'diaplay' misspelling
# because other modules may import it by this exact name.
# NOTE(review): several description strings contain typos ('Multitak' for
# 'Multitask', 'auido', 'indicidual ... form') — fixing them changes displayed
# text only, but should be done in a dedicated copy-edit pass.
dataset_diaplay_information = {
'LibriSpeech-Clean' : 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
'LibriSpeech-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
'CommonVoice-15-EN' : 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
'Peoples-Speech' : 'A large-scale, open-source speech recognition dataset, with diverse accents and domains.',
'GigaSpeech-1' : 'A large-scale ASR dataset with diverse audio sources like podcasts, interviews, etc.',
'Earnings-21' : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
'Earnings-22' : 'Similar to Earnings21, but covering earnings calls from 2022.',
'TED-LIUM-3' : 'A test set derived from TED talks, covering diverse speakers and topics.',
'TED-LIUM-3-LongForm' : 'A longer version of the TED-LIUM dataset, containing extended audio samples. This poses challenges to existing fusion methods in handling long audios. However, it provides benchmark for future development.',
'AISHELL-ASR-ZH' : 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.',
# Speech translation.
'CoVoST2-EN-ID' : 'CoVoST 2 dataset for speech translation from English to Indonesian.',
'CoVoST2-EN-ZH' : 'CoVoST 2 dataset for speech translation from English to Chinese.',
'CoVoST2-EN-TA' : 'CoVoST 2 dataset for speech translation from English to Tamil.',
'CoVoST2-ID-EN' : 'CoVoST 2 dataset for speech translation from Indonesian to English.',
'CoVoST2-ZH-EN' : 'CoVoST 2 dataset for speech translation from Chinese to English.',
'CoVoST2-TA-EN' : 'CoVoST 2 dataset for speech translation from Tamil to English.',
# Spoken QA, instruction following, captioning, classification.
'CN-College-Listen-MCQ': 'Chinese College English Listening Test, with multiple-choice questions.',
'DREAM-TTS-MCQ' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
'SLUE-P2-SQA5' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
'Public-SG-Speech-QA' : 'Public dataset for speech-based question answering, gathered from Singapore.',
'Spoken-SQuAD' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.',
'OpenHermes-Audio' : 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
'ALPACA-Audio' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.',
'WavCaps' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
'AudioCaps' : 'AudioCaps dataset, used for generating captions from general audio events.',
'Clotho-AQA' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
'WavCaps-QA' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
'AudioCaps-QA' : 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.',
'VoxCeleb-Accent' : 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.',
'MNSC-AR-Sentence' : 'Accent recognition based on the IMDA NSC dataset, focusing on sentence-level accents.',
'MNSC-AR-Dialogue' : 'Accent recognition based on the IMDA NSC dataset, focusing on dialogue-level accents.',
'VoxCeleb-Gender': 'Test dataset for gender classification, also derived from VoxCeleb.',
'IEMOCAP-Gender' : 'Gender classification based on the IEMOCAP dataset.',
'IEMOCAP-Emotion': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
'MELD-Sentiment' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
'MELD-Emotion' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.',
'MuChoMusic' : 'Test dataset for music understanding, from paper: MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models.',
# MNSC (IMDA NSC) ASR / QA / summarization parts.
'MNSC-PART1-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 1.',
'MNSC-PART2-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 2.',
'MNSC-PART3-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 3.',
'MNSC-PART4-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 4.',
'MNSC-PART5-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 5.',
'MNSC-PART6-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 6.',
'MNSC-PART3-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
'MNSC-PART4-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
'MNSC-PART5-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
'MNSC-PART6-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
'MNSC-PART3-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 3.',
'MNSC-PART4-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 4.',
'MNSC-PART5-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 5.',
'MNSC-PART6-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 6.',
# Placeholder descriptions for sets without final copy yet.
'CNA' : 'Under Development',
'IDPC' : 'Under Development',
'Parliament' : 'Under Development',
'UKUS-News' : 'Under Development',
'Mediacorp' : 'Under Development',
'IDPC-Short' : 'Under Development',
'Parliament-Short': 'Under Development',
'UKUS-News-Short' : 'Under Development',
'Mediacorp-Short' : 'Under Development',
'CommonVoice-ZH' : 'Under Development',
'CommonVoice-17-Indonesian' : 'Under Development',
'CommonVoice-17-Tamil' : 'Under Development',
'CommonVoice-17-Thai' : 'Under Development',
'CommonVoice-17-Vietnamese' : 'Under Development',
'GigaSpeech-2-Indonesain' : 'Under Development',
'GigaSpeech-2-Thai' : 'Under Development',
'GigaSpeech-2-Vietnamese' : 'Under Development',
'Fleurs-Tamil' : 'Under Development',
'Lotus-Thai' : 'Under Development',
'MMAU-mini' : 'Under Development',
# YouTube ASR batches.
'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
'YouTube ASR: English with Strong Emotion' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English, Singlish and some unknown languages audio clips, featuring speech with strong emotional expression. <br> It includes approximately 3.9 hours of audio, with each clip lasting 30 seconds.',
'YouTube ASR: Malay with English Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset mainly contains Malay and some Malay-English codeswitch audio clips, featuring with English prompts. <br> It includes approximately 2.55 hours of audio, with indicidual clips ranging form 30 seconds to 95 seconds in length.',
'YouTube ASR: Malay with Malay Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset use the same audio from <i>YouTube ASR: Malay English Prompt</i>, except featuring with Malay prompts. <br> It includes approximately 2.55 hours of audio, with indicidual clips ranging form 30 seconds to 95 seconds in length.',
'YouTube ASR: Chinese with English Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains Chinese and some Chinese-English codeswitch audio clips, featuring with English prompts. <br> It includes approximately 3.32 hours of audio, with individual clips ranging from 17 seconds to 1966 seconds in length.',
'YouTube ASR: Chinese with Chinese Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains Chinese and some Chinese-English codeswitch audio clips, featuring with Chinese prompts. <br> It includes approximately 3.32 hours of audio, with individual clips ranging from 17 seconds to 1966 seconds in length.',
# NOTE(review): the 'Tamil with Tamil Prompt' key below has a description but no
# entry in any dataset list or name mapping in this file — confirm whether it is
# still needed.
'YouTube ASR: Tamil with Tamil Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains Tamil and some Tamil-English codeswitch audio clips, featuring with Tamil prompts. <br> It includes approximately 2.44 hours of audio, with individual clips ranging from 30 seconds to 324 seconds in length.',
'YouTube ASR: Tamil with English Prompt': 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains Tamil and some Tamil-English codeswitch audio clips, featuring with English prompts. <br> It includes approximately 2.44 hours of audio, with individual clips ranging from 30 seconds to 324 seconds in length.',
'YouTube-TA-En':'YouTube Evaluation Dataset for ASR Task: <br> The audio of dataset is same as <i>YouTube ASR: Tamil<i>',
'YouTube-ZH-En':'YouTube Evaluation Dataset for ASR Task: <br> The audio of dataset is same as <i>YouTube ASR: Chinese<i>',
'YouTube-MA-En':'YouTube Evaluation Dataset for ASR Task: <br> The audio of dataset is same as <i>YouTube ASR: Malay<i>',
# 'YouTube ASR Translation: Chinese2English': 'YouTube Evaluation Dataset for ASR Task: <br> The audio of dataset is same as <i>YouTube ASR: Chinese<i>',
# 'YouTube ASR Translation: Tamil2English': 'YouTube Evaluation Dataset for ASR Task: <br> The audio of dataset is same as <i>YouTube ASR: Tamil<i>',
'SEAME-Dev-Mandarin' : 'Under Development',
'SEAME-Dev-Singlish' : 'Under Development',
# YouTube QA / summarization / paralinguistic QA batches.
'YouTube SQA: English with Singapore Content': 'YouTube Evaluation Dataset for Speech-QA Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 7.6 hours of audio, with individual clips ranging from 8 seconds to 32 seconds in length.',
'YouTube SQA: Malay': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Malay<i>, it contains Malay and some Malay-English codeswitch audio clips, featuring with English prompts. <br> It includes approximately 2.55 hours of audio, with indicidual clips ranging form 30 seconds to 95 seconds in length.',
'YouTube SQA: Chinese': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Chinese<i>',
'YouTube SQA: Tamil': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Tamil<i>',
'YouTube SDS: English with Singapore Content': 'YouTube Evaluation Dataset for Summary Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 5.4 hours of audio, with individual clips ranging from 8 seconds to 32 seconds in length.',
# NOTE(review): the three SDS descriptions below say 'Speech-QA Task' — likely
# copy-pasted from the SQA entries; confirm intended wording.
'YouTube SDS: Malay': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Malay<i>, it contains Malay and some Malay-English codeswitch audio clips, featuring with English prompts. <br> It includes approximately 2.55 hours of audio, with indicidual clips ranging form 30 seconds to 95 seconds in length.',
'YouTube SDS: Chinese': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Chinese<i>',
'YouTube SDS: Tamil': 'YouTube Evaluation Dataset for Speech-QA Task: <br> The auido of this dataset is same as <i>YouTube ASR: Tamil<i>',
'YouTube PQA: English with Singapore Content': 'YouTube Evaluation Dataset for Paralinguistics QA Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 41.4 hours of audio, with individual clips ranging from 41 seconds to 83 seconds in length.',
}
# Metric id -> human-readable description shown alongside leaderboard scores.
metrics_info = {
    'wer' : 'Word Error Rate (WER) - The Lower, the better.',
    # FIX: corrected 'Peformance' -> 'Performance' in the two judge entries
    # (user-facing display text; keys unchanged).
    'llama3_70b_judge_binary': 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
    'llama3_70b_judge' : 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
    'meteor' : 'METEOR Score. The higher, the better.',
    'bleu' : 'BLEU Score. The higher, the better.',
}