# RadTTS training configuration.
#
# Conventions used throughout this file:
#   * `${key}` is an interpolation that resolves to the top-level key of the
#     same name defined in the first section below.
#   * `???` marks a value with no default; it has to be provided by the user
#     (presumably as a Hydra/OmegaConf override - `???` is OmegaConf's
#     mandatory-value marker).
#   * `_target_` names the class that the corresponding section configures.

name: RadTTS
sample_rate: 22050

# ---------------------------------------------------------------------------
# Paths and dataset-level inputs
# ---------------------------------------------------------------------------
train_dataset: ???
validation_datasets: ???
# NOTE(review): YAML reads `None` as the string "None", not as a null value.
# It is left unchanged because the code consuming `init_from_ptl_ckpt` is not
# visible here - confirm whether `null` was intended.
ckpt_path: None
export_dir: ???
sup_data_path: ???
sup_data_types:
  - "log_mel"
  - "align_prior_matrix"
  - "pitch"
  - "voiced_mask"
  - "p_voiced"
  - "energy"

# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std: ???  # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}

  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: true
      use_chars: true
      use_stresses: true

  # Training data. Every signal-processing parameter is taken from the
  # top-level defaults so that train and validation stay in sync.
  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      # NOTE(review): this dataset-level tokenizer (EnglishPhonemesTokenizer /
      # EnglishG2p) differs from model.text_tokenizer (IPATokenizer / IPAG2P)
      # while pointing at the same phoneme dictionary - confirm which one the
      # dataset actually ends up using.
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 8
      num_workers: 8
      pin_memory: false

  # Validation data. Same settings as train_ds apart from the manifest and
  # `shuffle: false`.
  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      # NOTE(review): same tokenizer mismatch with model.text_tokenizer as in
      # train_ds (EnglishPhonemesTokenizer here vs IPATokenizer there).
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 8
      pin_memory: false

  optim:
    name: RAdam
    lr: 0.0001
    betas: [0.9, 0.98]
    weight_decay: 0.000001

    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    # NOTE(review): dur_loss_weight / ctc_loss_weight are also listed under
    # `loss_weights` below, and ctc_loss_weight has a different value there
    # (0.1 vs 1). Which of the two the training code reads is not visible
    # from this file - confirm before tuning either.
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 6000
    kl_loss_start_iter: 18000
    loss_weights:
      ctc_loss_weight: 0.1
      dur_loss_weight: 1.0
      f0_loss_weight: 1.0
      energy_loss_weight: 1.0
      vpred_loss_weight: 1.0
    unfreeze_modules: "all"

  load_from_checkpoint: false
  init_from_ptl_ckpt: ${ckpt_path}

  modelConfig:
    _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
    n_speakers: 1
    n_speaker_dim: 16
    n_text: 384  #185
    n_text_dim: 512
    n_flows: 8
    n_conv_layers_per_step: 4
    # NOTE(review): hard-coded, whereas model.n_mel_channels follows
    # ${n_mels}. Both are 80 by default; overriding n_mels alone will leave
    # this value behind.
    n_mel_channels: 80
    n_hidden: 1024
    mel_encoder_n_hidden: 512
    dummy_speaker_embedding: false
    n_early_size: 2
    n_early_every: 2
    n_group_size: 2
    affine_model: wavenet
    include_modules: "decatnvpred"
    scaling_fn: tanh
    matrix_decomposition: LUS
    learn_alignments: true
    use_context_lstm: true
    context_lstm_norm: spectral
    context_lstm_w_f0_and_energy: true
    text_encoder_lstm_norm: spectral
    n_f0_dims: 1
    n_energy_avg_dims: 1
    use_first_order_features: false
    unvoiced_bias_activation: "relu"
    decoder_use_partial_padding: false
    decoder_use_unvoiced_bias: true
    ap_pred_log_f0: true
    ap_use_unvoiced_bias: true
    ap_use_voiced_embeddings: true
    dur_model_config: null
    f0_model_config: null
    energy_model_config: null
    v_model_config:
      name: dap
      hparams:
        n_speaker_dim: 16
        take_log_of_input: false
        bottleneck_hparams:
          in_dim: 512
          reduction_factor: 16
          norm: weightnorm
          non_linearity: relu
        arch_hparams:
          out_dim: 1
          n_layers: 2
          n_channels: 256
          kernel_size: 3
          p_dropout: 0.5

# ---------------------------------------------------------------------------
# Trainer
# ---------------------------------------------------------------------------
# Checkpointing and logging are switched off here; the exp_manager section
# below asks for its own tensorboard logger and checkpoint callback instead.
trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false
  logger: false
  gradient_clip_val: 1
  log_every_n_steps: 100
  check_val_every_n_epoch: 5

# ---------------------------------------------------------------------------
# Experiment manager
# ---------------------------------------------------------------------------
exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val/loss_ctc
    mode: min
    filepath: ${export_dir}
    filename: model_checkpoint