# This config contains the default values for self-supervised pre-training of a Conformer ASR model, large size (~120M).
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
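# As a worked example (a sketch, using fields defined later in this file): the effective batch size is
# train_ds.batch_size * trainer.devices * trainer.num_nodes * trainer.accumulate_grad_batches, so with
# batch_size=16 you could reach ~2K with, e.g., 16 GPUs and accumulate_grad_batches=8 (16 * 16 * 1 * 8 = 2048).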
# Here are the recommended configs for the different variants of the Conformer encoder; other parameters are the same as in this config file.
# One extra layer (compared to the original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one.
#
# +--------------+---------+---------+----------+------------+-----+
# | Model        | d_model | n_heads | n_layers | time_masks | lr  |
# +==============+=========+=========+==========+============+=====+
# | Small (13M)  |   176   |    4    |    16    |     5      | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Medium (30M) |   256   |    4    |    18    |     5      | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Large (121M) |   512   |    8    |    18    |     10     | 2.0 |
# +--------------+---------+---------+----------+------------+-----+
#
# If you do not want to train with AMP, you may use a weight decay of 0.0 or reduce the number of time maskings to 2
# with time_width=100. This may help when you want to train for fewer epochs and need faster convergence.
# With weight_decay=0.0, the learning rate may need to be reduced to 2.0.
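#
# Example launch (a sketch; the script path below is an assumption and may differ across NeMo versions):
#   python examples/asr/speech_pretraining/speech_pre_training.py \
#     --config-path=<dir containing this config> --config-name=conformer_ssl \
#     model.train_ds.manifest_filepath=<path to train manifest> \
#     model.validation_ds.manifest_filepath=<path to validation manifest>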
name: "Conformer-SSL"

model:
  sample_rate: 16000

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: false
    use_start_end_token: true
    trim_silence: false
    max_duration: 16.7
    min_duration: 8.0
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
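    # Note (assumed NeMo semantics): bucketing only takes effect with tarred datasets; bucketing_batch_size
    # may be a single int or a list with one entry per bucket, in which case batch_size above is ignored.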

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    min_duration: 8.0

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 48
    mask_patches: 0.5
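  # A rough reading of the params above (assumed semantics of MaskedPatchAugmentation): time masking is
  # applied in patches of patch_size=48 feature frames, with mask_patches=0.5 interpreted as a fraction,
  # i.e. roughly half of each utterance's time steps get masked, plus 3 frequency masks of width up to 20 bins.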

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set this if you need an output size different from the default d_model
    n_layers: 18
    d_model: 512

    # Sub-sampling params
    subsampling: striding # vggnet, striding, stacking, stacking_norm, or dw_striding
    subsampling_factor: 4 # must be a power of 2 for striding and vggnet
    subsampling_conv_channels: -1 # -1 sets it to d_model

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm, layer_norm, or groupnormN (N specifies the number of groups)

    ### regularization
    dropout: 0.1 # the dropout used in most of the Conformer modules
    dropout_pre_encoder: 0.1 # the dropout used before the encoder
    dropout_emb: 0.0 # the dropout used for embeddings
    dropout_att: 0.1 # the dropout for multi-headed attention modules

  decoder_out: 128

  loss_list:
    contrastive:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.encoder.d_model}
        feat_hidden: 128 # number of features in the decoder's hidden layer
        feat_out: ${model.decoder_out}
        stride_layers: 0
        # if loss.combine_time_steps is less than the encoder stride, then a corresponding number of stride_layers needs to
        # be added to the decoder (here the encoder stride and combine_time_steps are both 4)
        non_stride_layers: 0
      loss:
        _target_: nemo.collections.asr.losses.ContrastiveLoss
        in_dim: ${model.preprocessor.features}
        proj_dim: ${model.decoder_out}
        combine_time_steps: 4 # how many spectrogram time steps are combined into one target/representation for the contrastive task
        quantized_targets: true # whether to use a quantizer (true) or a linear layer (false) to produce targets
        # (the quantizer is required to extract pseudo-labels for other losses)
        codebook_size: 300 # number of vectors in the quantization codebook per group
        num_groups: 2 # number of groups in the quantizer codebook
        num_negatives: 100 # number of sampled negatives for each target
        sample_from_same_utterance_only: true # whether negatives are sampled only from the same utterance
        sample_from_non_masked: false # whether negatives are sampled from non-masked steps

    mlm:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoder
        feat_in: ${model.encoder.d_model}
        num_classes: 90000
        # set this to be equal to codebook_size^num_groups from the contrastive loss
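        # e.g., with the contrastive loss above: codebook_size=300, num_groups=2, hence 300^2 = 90000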
      loss:
        _target_: nemo.collections.asr.losses.MLMLoss
        combine_time_steps: 4
      targets_from_loss: "contrastive"
      # since this loss requires targets, we can either get them from a manifest or from a quantized contrastive loss
      loss_alpha: 1000.
      # multiplier applied to this loss relative to others
      transpose_encoded: false
      # transposing the input may be necessary depending on which layer is used as input to the decoder
      start_step: 0
      # determines at what global step this loss starts being used;
      # this can be set to a higher number if your training is long enough,
      # which may increase early training stability
      output_from_layer: null
      # to use the output of a non-final encoder layer as input to this decoder,
      # specify that layer's name here
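  # With the settings above, the total pre-training objective is (roughly)
  # contrastive_loss + 1000 * mlm_loss, since loss_alpha scales the MLM term.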

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 25000
      warmup_ratio: null
      min_lr: 1e-6
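      # For reference, the Noam schedule scales the learning rate roughly as
      #   lr * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5))
      # which is why lr above is a large multiplier (5.0) rather than an absolute rate.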

trainer:
  devices: -1 # number of GPUs; -1 uses all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32 # should be set to 16 for O1 and O2 to enable AMP
  log_every_n_steps: 10 # interval of logging
  enable_progress_bar: True
  resume_from_checkpoint: null # the path to a checkpoint file to continue the training; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # number of validation steps to run as a sanity check before training starts; set to 0 to disable
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, the first one is used
    monitor: "val_loss"
    mode: "min"
    save_top_k: 5

  # you need to set these two to true to resume training from an existing run
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # you may use this section to create a Weights & Biases logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null