|
% Shen et al., ICASSP 2018: TTS synthesis with a WaveNet conditioned on mel spectrogram predictions.
% Acronyms and product names in the title are brace-protected so sentence-casing styles keep them.
@inproceedings{shen2018natural,
  author    = {Shen, Jonathan and Pang, Ruoming and Weiss, Ron J. and Schuster, Mike and Jaitly, Navdeep and Yang, Zongheng and Chen, Zhifeng and Zhang, Yu and Wang, Yuxuan and Skerry-Ryan, RJ and Saurous, Rif A. and Agiomyrgiannakis, Yannis and Wu, Yonghui},
  title     = {Natural {TTS} Synthesis by Conditioning {WaveNet} on Mel Spectrogram Predictions},
  booktitle = {2018 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages     = {4779--4783},
  year      = {2018},
  publisher = {IEEE},
}
|
|
|
% Lancucki, ICASSP 2021: parallel text-to-speech with pitch prediction.
% The model name is brace-protected so sentence-casing styles keep its capitalisation.
@inproceedings{lancucki2021fastpitch,
  author    = {{\L}a{\'n}cucki, Adrian},
  title     = {{FastPitch}: Parallel Text-to-Speech with Pitch Prediction},
  booktitle = {2021 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages     = {6588--6592},
  year      = {2021},
  publisher = {IEEE},
}
|
|
|
% Tatanov et al., ICASSP 2022: non-autoregressive TTS model conditioned on language model embeddings.
% The model name is brace-protected so sentence-casing styles keep its capitalisation.
@inproceedings{tatanov2022mixer,
  author    = {Tatanov, Oktai and Beliaev, Stanislav and Ginsburg, Boris},
  title     = {{Mixer-TTS}: Non-Autoregressive, Fast and Compact Text-to-Speech Model Conditioned on Language Model Embeddings},
  booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages     = {7482--7486},
  year      = {2022},
  publisher = {IEEE},
}
|
|
|
% Shih et al., ICML 2021 workshop paper: parallel flow-based TTS with alignment learning.
% The fourth author's surname uses the same LaTeX-escaped spelling as the other entries in this file.
@inproceedings{shih2021rad,
  author    = {Shih, Kevin J. and Valle, Rafael and Badlani, Rohan and {\L}a{\'n}cucki, Adrian and Ping, Wei and Catanzaro, Bryan},
  title     = {{RAD-TTS}: Parallel Flow-Based {TTS} with Robust Alignment Learning and Diverse Synthesis},
  booktitle = {{ICML} Workshop on Invertible Neural Networks, Normalizing Flows, and Explicit Likelihood Models},
  year      = {2021},
}
|
|
|
% Kong et al., NeurIPS 2020 (volume 33): GAN-based vocoder for speech synthesis.
% Typed as a proceedings paper: the venue is a conference proceedings series, so it goes in booktitle.
@inproceedings{kong2020hifi,
  author    = {Kong, Jungil and Kim, Jaehyeon and Bae, Jaekyoung},
  title     = {{HiFi-GAN}: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {33},
  pages     = {17022--17033},
  year      = {2020},
}
|
|
|
% Prenger et al., ICASSP 2019: flow-based generative network for speech synthesis.
% The model name is brace-protected so sentence-casing styles keep its capitalisation.
@inproceedings{prenger2019waveglow,
  author    = {Prenger, Ryan and Valle, Rafael and Catanzaro, Bryan},
  title     = {{WaveGlow}: A Flow-Based Generative Network for Speech Synthesis},
  booktitle = {2019 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages     = {3617--3621},
  year      = {2019},
  publisher = {IEEE},
}
|
|
|
% Jang et al., Interspeech 2021: neural vocoder with multi-resolution spectrogram discriminators.
% Only the model name is brace-protected; the rest of the title is left to the bibliography style.
@inproceedings{jang21_interspeech,
  author    = {Jang, Won and Lim, Dan and Yoon, Jaesam and Kim, Bongwan and Kim, Juntae},
  title     = {{UnivNet}: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation},
  booktitle = {Proc. Interspeech 2021},
  pages     = {2207--2211},
  year      = {2021},
  doi       = {10.21437/Interspeech.2021-1016},
}
|
|
|
% Badlani et al., ICASSP 2022: a TTS alignment framework.
% The acronym in the title is brace-protected so sentence-casing styles keep it upper-case.
@inproceedings{badlani2022one,
  author    = {Badlani, Rohan and {\L}a{\'n}cucki, Adrian and Shih, Kevin J. and Valle, Rafael and Ping, Wei and Catanzaro, Bryan},
  title     = {One {TTS} Alignment to Rule Them All},
  booktitle = {2022 {IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  pages     = {6092--6096},
  year      = {2022},
  publisher = {IEEE},
}
|
|
|
% Xue et al., 2021 arXiv preprint: pre-trained byte-to-byte models.
% journal is kept so classic styles still print the arXiv id; eprint fields serve eprint-aware styles.
@article{xue2021byt5,
  author        = {Xue, Linting and Barua, Aditya and Constant, Noah and Al-Rfou, Rami and Narang, Sharan and Kale, Mihir and Roberts, Adam and Raffel, Colin},
  title         = {{ByT5}: Towards a Token-Free Future with Pre-Trained Byte-to-Byte Models},
  journal       = {arXiv preprint arXiv:2105.13626},
  year          = {2021},
  eprint        = {2105.13626},
  archiveprefix = {arXiv},
}
|
|
|
% Rezackova et al., 2021: text-to-text transfer transformer for grapheme-to-phoneme conversion.
% NOTE(review): venue taken to be Interspeech 2021 (the original only named ISCA) - confirm, then add pages and DOI.
@inproceedings{vrezavckova2021t5g2p,
  author       = {{\v{R}}ez{\'a}{\v{c}}kov{\'a}, Mark{\'e}ta and {\v{S}}vec, Jan and Tihelka, Daniel},
  title        = {{T5G2P}: Using Text-to-Text Transfer Transformer for Grapheme-to-Phoneme Conversion},
  booktitle    = {Proc. Interspeech 2021},
  year         = {2021},
  organization = {International Speech Communication Association},
}
|
|
|
% Zhu et al., 2022 arXiv preprint: multilingual grapheme-to-phoneme conversion with ByT5.
% journal is kept so classic styles still print the arXiv id; eprint fields serve eprint-aware styles.
@article{zhu2022byt5,
  author        = {Zhu, Jian and Zhang, Cong and Jurgens, David},
  title         = {{ByT5} Model for Massively Multilingual Grapheme-to-Phoneme Conversion},
  journal       = {arXiv preprint arXiv:2204.03067},
  year          = {2022},
  eprint        = {2204.03067},
  archiveprefix = {arXiv},
}
|
|
|
% Gulati et al., 2020 arXiv preprint: convolution-augmented transformer for speech recognition.
% NOTE(review): the key's leading "gg" looks like a typo; it is kept because documents cite it as-is.
@article{ggulati2020conformer,
  author        = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  title         = {{Conformer}: Convolution-Augmented Transformer for Speech Recognition},
  journal       = {arXiv preprint arXiv:2005.08100},
  year          = {2020},
  eprint        = {2005.08100},
  archiveprefix = {arXiv},
}
|
|
|
% Gorman et al., LREC 2018: homograph disambiguation with supervised machine learning.
@inproceedings{gorman2018improving,
  author    = {Gorman, Kyle and Mazovetskiy, Gleb and Nikolaev, Vitaly},
  title     = {Improving homograph disambiguation with supervised machine learning},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)},
  year      = {2018}
}
|
% Kim et al., ICML 2021: conditional VAE with adversarial learning for end-to-end text-to-speech.
% PMLR is the publisher of the proceedings, so it goes in publisher with its series and volume.
@inproceedings{kim2021conditional,
  author    = {Kim, Jaehyeon and Kong, Jungil and Son, Juhee},
  title     = {Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech},
  booktitle = {International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {139},
  pages     = {5530--5540},
  year      = {2021},
  publisher = {PMLR},
}