# NeMo examples/tts/conf/tacotron2.yaml
# Mirrored by camenduru — thanks to NVIDIA. Upstream commit: 7934b29
# This config contains the default values for training Tacotron2 model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.
name: Tacotron2
# Manifest file paths. "???" is the Hydra/OmegaConf marker for a mandatory value:
# it must be supplied on the command line (e.g. train_dataset=/path/train.json).
train_dataset: ???
validation_datasets: ???
# Supplementary-data directory and types consumed by TTSDataset; null disables them.
# NOTE(review): presumably precomputed features such as pitch — confirm against TTSDataset docs.
sup_data_path: null
sup_data_types: null
# Grapheme-to-phoneme resources, referenced by model.text_tokenizer.g2p via interpolation.
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
model:
  # Pitch search range in Hz, forwarded to the datasets below via interpolation.
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  # Audio / mel-spectrogram parameters, shared by datasets and preprocessor.
  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  pad_value: -11.52

  # Text normalization (numbers, abbreviations, etc.) applied before tokenization.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # English phoneme tokenizer with CMUdict-backed grapheme-to-phoneme conversion.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  # Mel-spectrogram featurizer used to compute the training targets.
  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 16
    pad_value: ${model.pad_value}
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6
    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5
# PyTorch Lightning Trainer configuration.
trainer:
  devices: 1  # number of gpus
  max_epochs: ???  # mandatory — must be supplied on the command line
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false
# NeMo experiment manager: logging directory, TensorBoard, and checkpointing.
exp_manager:
  exp_dir: null  # defaults to ./nemo_experiments when null — TODO confirm against exp_manager docs
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # Keep the checkpoint with the lowest validation loss.
    monitor: val_loss
    mode: min