NeMo / examples /tts /conf /fastpitch_align_ipa.yaml

thanks to NVIDIA ❤

7934b29 about 2 years ago

6.78 kB

	# This config contains the default values for training a FastPitch model with aligner.
	# If you want to train a model on another dataset, you can change the config values according to your dataset.
	# Most dataset-specific arguments are in the head of the config file, see below.

	# Experiment name; reused through the ${name} interpolation.
	name: FastPitch

	# Mandatory values (???) that must be supplied for the dataset in use.
	train_dataset: ???
	validation_datasets: ???
	sup_data_path: ???
	sup_data_types: ['align_prior_matrix', 'pitch']

	# Pitch search range; these are the librosa.pyin defaults.
	pitch_fmin: 65.40639132514966
	pitch_fmax: 2093.004522404789

	# Frame-wise pitch statistics. They depend on pitch_fmin and pitch_fmax and can be
	# obtained by running `scripts/dataset_processing/tts/extract_sup_data.py`.
	pitch_mean: ???  # LJSpeech example: 212.35873413085938
	pitch_std: ???  # LJSpeech example: 68.52806091308594

	# Audio / spectrogram defaults for a dataset with sample_rate=22050.
	sample_rate: 22050
	n_mel_channels: 80
	n_window_size: 1024
	n_window_stride: 256
	n_fft: 1024
	lowfreq: 0
	highfreq: 8000
	window: hann

	# Pronunciation resources referenced by the g2p settings.
	phoneme_dict_path: 'scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt'
	heteronyms_path: 'scripts/tts_dataset_files/heteronyms-052722'

	# Model section. Its children are nested under `model` so that the `${model.*}`
	# interpolations used below resolve, and so that keys such as `pitch_fmin` or
	# `_target_` no longer collide with same-named keys elsewhere in the file.
	# NOTE(review): nesting below the `model.*` level was reconstructed after the
	# original indentation was lost — confirm against the upstream config.
	model:
	  learn_alignment: true
	  bin_loss_warmup_epochs: 100

	  n_speakers: 1
	  max_token_duration: 75
	  symbols_embedding_dim: 384
	  pitch_embedding_kernel_size: 3

	  # Values forwarded from the top-level dataset parameters.
	  pitch_fmin: ${pitch_fmin}
	  pitch_fmax: ${pitch_fmax}

	  pitch_mean: ${pitch_mean}
	  pitch_std: ${pitch_std}

	  sample_rate: ${sample_rate}
	  n_mel_channels: ${n_mel_channels}
	  n_window_size: ${n_window_size}
	  n_window_stride: ${n_window_stride}
	  n_fft: ${n_fft}
	  lowfreq: ${lowfreq}
	  highfreq: ${highfreq}
	  window: ${window}

	  text_normalizer:
	    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
	    lang: en
	    input_case: cased

	  text_normalizer_call_kwargs:
	    verbose: false
	    punct_pre_process: true
	    punct_post_process: true

	  text_tokenizer:
	    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
	    punct: true
	    apostrophe: true
	    pad_with_space: true
	    # NOTE(review): keys through `use_stresses` are assumed to be g2p arguments — confirm.
	    g2p:
	      _target_: nemo.collections.tts.g2p.modules.IPAG2P
	      phoneme_dict: ${phoneme_dict_path}
	      heteronyms: ${heteronyms_path}
	      phoneme_probability: 0.8
	      # Relies on the heteronyms list for anything that needs to be disambiguated
	      ignore_ambiguous_words: false
	      use_chars: true
	      use_stresses: true

	  # Training data: dataset arguments plus loader settings.
	  train_ds:
	    dataset:
	      _target_: nemo.collections.tts.data.tts_dataset.TTSDataset
	      manifest_filepath: ${train_dataset}
	      sample_rate: ${model.sample_rate}
	      sup_data_path: ${sup_data_path}
	      sup_data_types: ${sup_data_types}
	      n_fft: ${model.n_fft}
	      win_length: ${model.n_window_size}
	      hop_length: ${model.n_window_stride}
	      window: ${model.window}
	      n_mels: ${model.n_mel_channels}
	      lowfreq: ${model.lowfreq}
	      highfreq: ${model.highfreq}
	      max_duration: null
	      min_duration: 0.1
	      ignore_file: null
	      trim: false
	      pitch_fmin: ${model.pitch_fmin}
	      pitch_fmax: ${model.pitch_fmax}
	      pitch_norm: true
	      pitch_mean: ${model.pitch_mean}
	      pitch_std: ${model.pitch_std}
	      use_beta_binomial_interpolator: true

	    dataloader_params:
	      drop_last: false
	      shuffle: true
	      batch_size: 32
	      num_workers: 12

	  # Validation data: same dataset arguments, no shuffling, no minimum duration.
	  validation_ds:
	    dataset:
	      _target_: nemo.collections.tts.data.tts_dataset.TTSDataset
	      manifest_filepath: ${validation_datasets}
	      sample_rate: ${model.sample_rate}
	      sup_data_path: ${sup_data_path}
	      sup_data_types: ${sup_data_types}
	      n_fft: ${model.n_fft}
	      win_length: ${model.n_window_size}
	      hop_length: ${model.n_window_stride}
	      window: ${model.window}
	      n_mels: ${model.n_mel_channels}
	      lowfreq: ${model.lowfreq}
	      highfreq: ${model.highfreq}
	      max_duration: null
	      min_duration: null
	      ignore_file: null
	      trim: false
	      pitch_fmin: ${model.pitch_fmin}
	      pitch_fmax: ${model.pitch_fmax}
	      pitch_norm: true
	      pitch_mean: ${model.pitch_mean}
	      pitch_std: ${model.pitch_std}
	      use_beta_binomial_interpolator: true

	    dataloader_params:
	      drop_last: false
	      shuffle: false
	      batch_size: 32
	      num_workers: 8

	  preprocessor:
	    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
	    features: ${model.n_mel_channels}
	    lowfreq: ${model.lowfreq}
	    highfreq: ${model.highfreq}
	    n_fft: ${model.n_fft}
	    n_window_size: ${model.n_window_size}
	    window_size: false
	    n_window_stride: ${model.n_window_stride}
	    window_stride: false
	    pad_to: 1
	    pad_value: 0
	    sample_rate: ${model.sample_rate}
	    window: ${model.window}
	    normalize: null
	    preemph: null
	    dither: 0.0
	    frame_splicing: 1
	    log: true
	    log_zero_guard_type: add
	    # Explicit mantissa dot so YAML 1.1 loaders also read this as a float.
	    log_zero_guard_value: 1.0e-05
	    mag_power: 1.0

	  input_fft:  # n_embed and padding_idx are added by the model
	    _target_: nemo.collections.tts.modules.transformer.FFTransformerEncoder
	    n_layer: 6
	    n_head: 1
	    d_model: ${model.symbols_embedding_dim}
	    d_head: 64
	    d_inner: 1536
	    kernel_size: 3
	    dropout: 0.1
	    dropatt: 0.1
	    dropemb: 0.0
	    d_embed: ${model.symbols_embedding_dim}

	  output_fft:
	    _target_: nemo.collections.tts.modules.transformer.FFTransformerDecoder
	    n_layer: 6
	    n_head: 1
	    d_model: ${model.symbols_embedding_dim}
	    d_head: 64
	    d_inner: 1536
	    kernel_size: 3
	    dropout: 0.1
	    dropatt: 0.1
	    dropemb: 0.0

	  alignment_module:
	    _target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
	    n_text_channels: ${model.symbols_embedding_dim}

	  duration_predictor:
	    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
	    input_size: ${model.symbols_embedding_dim}
	    kernel_size: 3
	    filter_size: 256
	    dropout: 0.1
	    n_layers: 2

	  pitch_predictor:
	    _target_: nemo.collections.tts.modules.fastpitch.TemporalPredictor
	    input_size: ${model.symbols_embedding_dim}
	    kernel_size: 3
	    filter_size: 256
	    dropout: 0.1
	    n_layers: 2

	  # Optimizer settings. Exponent floats carry an explicit mantissa dot so that
	  # YAML 1.1 loaders also read them as numbers rather than strings.
	  optim:
	    name: adamw
	    lr: 1.0e-3
	    betas: [0.9, 0.999]
	    weight_decay: 1.0e-6

	    # NOTE(review): scheduler assumed to be nested under `optim` — confirm.
	    sched:
	      name: NoamAnnealing
	      warmup_steps: 1000
	      last_epoch: -1
	      d_model: 1  # Disable scaling based on model dim

	# Trainer settings. The keys are nested under `trainer` so they are no longer
	# parsed as unrelated top-level entries next to a null `trainer` value.
	trainer:
	  num_nodes: 1
	  devices: 1
	  accelerator: gpu
	  strategy: ddp
	  precision: 32
	  max_epochs: 1000
	  accumulate_grad_batches: 1
	  gradient_clip_val: 1000.0
	  enable_checkpointing: false  # Provided by exp_manager
	  logger: false  # Provided by exp_manager
	  log_every_n_steps: 100
	  check_val_every_n_epoch: 5
	  benchmark: false

	# Experiment manager settings. The keys are nested under `exp_manager` so that
	# `name: ${name}` refers to the top-level `name` instead of redefining it.
	exp_manager:
	  exp_dir: null
	  name: ${name}
	  create_tensorboard_logger: true
	  create_checkpoint_callback: true
	  # NOTE(review): only `monitor` is assumed to belong to the checkpoint callback
	  # parameters; the resume_* flags are placed at the exp_manager level — confirm.
	  checkpoint_callback_params:
	    monitor: val_loss
	  resume_if_exists: false
	  resume_ignore_no_checkpoint: false