# Top-level settings for the Tacotron2 training configuration.
#
# `???` marks a value with no default in this file; it has to be supplied
# before the configuration can be used (OmegaConf mandatory-value marker).
name: Tacotron2

# Dataset inputs, consumed further down through ${train_dataset},
# ${validation_datasets}, ${sup_data_path} and ${sup_data_types}.
train_dataset: ???
validation_datasets: ???
sup_data_path: null
sup_data_types: null

# Text-processing resources, consumed by the g2p settings through
# ${phoneme_dict_path} and ${heteronyms_path}.
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

model:
  # NOTE(review): nesting was rebuilt from the ${model.*} interpolation paths
  # and from key uniqueness (e.g. `sample_rate`, `n_fft`, `name` would be
  # duplicated if flattened). Placement of train_ds, validation_ds,
  # preprocessor, decoder, postnet and optim under `model` follows the usual
  # layout for this kind of config - confirm against the upstream file.

  # Pitch bounds, forwarded to both datasets as pitch_fmin / pitch_fmax.
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  # Audio / spectrogram settings shared by the datasets and the preprocessor
  # through ${model.*} interpolations, so they only need to be changed here.
  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  pad_value: -11.52

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  # Same dataset settings as train_ds except for the manifest; the dataloader
  # differs in shuffle, batch_size and num_workers.
  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 16
    pad_value: ${model.pad_value}
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    # Kept in sync with the encoder output size via interpolation.
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1
    n_mel_channels: ${model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5

    # Attention settings.
    attention_dim: 128
    attention_rnn_dim: 1024

    # Attention-location settings.
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6

    # Nested here because a flat `name` would collide with optim.name.
    # NOTE(review): confirm `sched` belongs under `optim`.
    sched:
      name: CosineAnnealing
      min_lr: 1e-5

# Trainer settings. `max_epochs` has no default and must be supplied.
trainer:
  devices: 1
  max_epochs: ???
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  # Checkpointing and logging are switched off here; the exp_manager section
  # of this file enables create_checkpoint_callback and
  # create_tensorboard_logger instead (presumably it supplies both).
  enable_checkpointing: false
  logger: false
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false

# Experiment-manager settings; the experiment name reuses the top-level
# `name` through interpolation.
exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # Checkpoint selection: track `val_loss` with mode `min`.
  checkpoint_callback_params:
    monitor: val_loss
    mode: min