# NeMo examples/tts/conf/tacotron2.yaml
# Mirrored by camenduru — thanks to NVIDIA. Upstream commit: 7934b29
# This config contains the default values for training Tacotron2 model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.
name: Tacotron2
# Manifest file paths. "???" is the Hydra/OmegaConf marker for a mandatory value:
# it must be supplied on the command line (e.g. train_dataset=/path/train.json).
train_dataset: ???
validation_datasets: ???
# Supplementary-data directory and types consumed by TTSDataset; null disables them.
# NOTE(review): presumably precomputed features such as pitch — confirm against TTSDataset docs.
sup_data_path: null
sup_data_types: null
# Grapheme-to-phoneme resources, referenced by model.text_tokenizer.g2p via interpolation.
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
model:
  # Pitch search range in Hz, forwarded to the datasets below via interpolation.
  pitch_fmin: 65.40639132514966
  pitch_fmax: 2093.004522404789

  # Audio / mel-spectrogram parameters, shared by datasets and preprocessor.
  sample_rate: 22050
  n_mel_channels: 80
  n_window_size: 1024
  n_window_stride: 256
  n_fft: 1024
  lowfreq: 0
  highfreq: 8000
  window: hann
  pad_value: -11.52

  # Text normalization (numbers, abbreviations, etc.) applied before tokenization.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # English phoneme tokenizer with CMUdict-backed grapheme-to-phoneme conversion.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
    punct: true
    stresses: true
    chars: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.EnglishG2p
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 48
      num_workers: 4
      pin_memory: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 24
      num_workers: 8
      pin_memory: true

  # Mel-spectrogram featurizer used to compute the training targets.
  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 16
    pad_value: ${model.pad_value}
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: false

  encoder:
    _target_: nemo.collections.tts.modules.tacotron2.Encoder
    encoder_kernel_size: 5
    encoder_n_convolutions: 3
    encoder_embedding_dim: 512

  decoder:
    _target_: nemo.collections.tts.modules.tacotron2.Decoder
    decoder_rnn_dim: 1024
    encoder_embedding_dim: ${model.encoder.encoder_embedding_dim}
    gate_threshold: 0.5
    max_decoder_steps: 1000
    n_frames_per_step: 1  # currently only 1 is supported
    n_mel_channels: ${model.n_mel_channels}
    p_attention_dropout: 0.1
    p_decoder_dropout: 0.1
    prenet_dim: 256
    prenet_p_dropout: 0.5
    # Attention parameters
    attention_dim: 128
    attention_rnn_dim: 1024
    # AttentionLocation Layer parameters
    attention_location_kernel_size: 31
    attention_location_n_filters: 32
    early_stopping: true

  postnet:
    _target_: nemo.collections.tts.modules.tacotron2.Postnet
    n_mel_channels: ${model.n_mel_channels}
    p_dropout: 0.5
    postnet_embedding_dim: 512
    postnet_kernel_size: 5
    postnet_n_convolutions: 5

  optim:
    name: adam
    lr: 1e-3
    weight_decay: 1e-6
    # scheduler setup
    sched:
      name: CosineAnnealing
      min_lr: 1e-5
# PyTorch Lightning Trainer configuration.
trainer:
  devices: 1  # number of gpus
  max_epochs: ???  # mandatory — must be supplied on the command line
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  gradient_clip_val: 1.0
  log_every_n_steps: 60
  check_val_every_n_epoch: 2
  benchmark: false
# NeMo experiment manager: logging directory, TensorBoard, and checkpointing.
exp_manager:
  exp_dir: null  # defaults to ./nemo_experiments when null — TODO confirm against exp_manager docs
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # Keep the checkpoint with the lowest validation loss.
    monitor: val_loss
    mode: min