# NeMo: examples/asr/conf/ssl/conformer/conformer_ssl.yaml
# This config contains the default values for self-supervised pre-training of a Conformer ASR model, large size (~120M).
#
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Below are the recommended configs for the different Conformer variants (same sizes as Conformer-CTC); other parameters are the same as in this config file.
# One extra layer (compared to the original paper) is added to the medium and large variants to compensate for replacing the LSTM decoder with a linear one.
#
# +--------------+---------+---------+----------+------------+-----+
# | Model        | d_model | n_heads | n_layers | time_masks | lr  |
# +==============+=========+=========+==========+============+=====+
# | Small  (13M) |   176   |    4    |    16    |      5     | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Medium (30M) |   256   |    4    |    18    |      5     | 5.0 |
# +--------------+---------+---------+----------+------------+-----+
# | Large (121M) |   512   |    8    |    18    |     10     | 2.0 |
# +--------------+---------+---------+----------+------------+-----+
#
# If you do not want to train with AMP, you may use a weight decay of 0.0 or reduce the number of time maskings to 2
# with time_width=100. This may help when you want to train for fewer epochs and need faster convergence.
# With weight_decay=0.0, the learning rate may need to be reduced to 2.0.
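#
# Example launch (illustrative; this assumes NeMo's examples/asr/speech_pretraining/speech_pre_training.py
# script and Hydra-style overrides; adjust paths and overrides to your setup):
#   python speech_pre_training.py \
#     --config-path=../conf/ssl/conformer --config-name=conformer_ssl \
#     model.train_ds.manifest_filepath=<path to training manifest(s)> \
#     model.validation_ds.manifest_filepath=<path to validation manifest> \
#     trainer.devices=-1 trainer.max_epochs=1000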
name: "Conformer-SSL"
model:
  sample_rate: 16000

  train_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: true
    num_workers: 8
    pin_memory: false
    use_start_end_token: true
    trim_silence: false
    max_duration: 16.7
    min_duration: 8.0
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null
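
  # Each manifest is expected to be a JSON-lines file in the standard NeMo ASR format, e.g. (illustrative paths/values):
  #   {"audio_filepath": "/data/audio/utt0001.wav", "duration": 12.3, "text": ""}
  # For self-supervised pre-training the transcript is not used as a training target, so "text" can typically be left empty.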

  validation_ds:
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    min_duration: 8.0

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 80
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    pad_value: 0.0

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 48
    mask_patches: 0.5
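
  # Note (an interpretation of the values above, not part of the original config): patch_size is in spectrogram
  # frames, so each masked patch spans 48 x 10 ms, roughly 0.5 s of audio, and a float mask_patches is treated as
  # the fraction of patches to mask, i.e. about half of each utterance's time steps, plus 3 frequency masks of width 20.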

  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # you may set it if you need different output size other than the default d_model
    n_layers: 18
    d_model: 512

    # Sub-sampling params
    subsampling: striding # vggnet, striding, stacking or stacking_norm, dw_striding
    subsampling_factor: 4 # must be power of 2 for striding and vggnet
    subsampling_conv_channels: -1 # -1 sets it to d_model
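
    # With a 10 ms spectrogram stride (window_stride: 0.01) and 4x subsampling, each encoder time step
    # corresponds to roughly 40 ms of audio.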

    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: true # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 5000

    # Convolution module's params
    conv_kernel_size: 31
    conv_norm_type: 'batch_norm' # batch_norm or layer_norm or groupnormN (N specifies the number of groups)

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1 # The dropout used before the encoder
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules

  decoder_out: 128

  loss_list:
    contrastive:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.encoder.d_model}
        feat_hidden: 128
        # features in hidden layer of decoder
        feat_out: ${model.decoder_out}
        stride_layers: 0
        # if loss.combine_time_steps is less than the encoder stride, then a corresponding amount of stride_layers needs
        # to be added to the decoder (here stride and combine_time_steps are both 4)
        non_stride_layers: 0
      loss:
        _target_: nemo.collections.asr.losses.ContrastiveLoss
        in_dim: ${model.preprocessor.features}
        proj_dim: ${model.decoder_out}
        combine_time_steps: 4 # how many spectrogram time steps are used for one target/representation for contrastive task
        quantized_targets: true # should quantizer or linear layer be used
        # (quantizer is required to extract pseudo-labels for other losses)
        codebook_size: 300 # number of vectors in the quantization codebook per group
        num_groups: 2 # number of groups in the quantizer codebook
        num_negatives: 100 # number of sampled negatives for each target
        sample_from_same_utterance_only: true # should negatives be sampled only from the same utterance
        sample_from_non_masked: false # should negatives be sampled from non-masked steps
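
      # For reference, this is a wav2vec 2.0-style contrastive objective (a sketch, assuming cosine similarity
      # sim and temperature k): L = -log( exp(sim(c_t, q_t)/k) / sum_{q'} exp(sim(c_t, q')/k) ), where c_t is the
      # projected encoder output at a masked step, q_t its quantized target, and the sum runs over q_t together
      # with the num_negatives sampled distractors.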

    mlm:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoder
        feat_in: ${model.encoder.d_model}
        num_classes: 90000
        # set this to be equal to codebook_size^groups in the contrastive loss
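        # e.g. with codebook_size=300 and num_groups=2 above: 300^2 = 90,000 possible quantized targets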
      loss:
        _target_: nemo.collections.asr.losses.MLMLoss
        combine_time_steps: 4
      targets_from_loss: "contrastive"
      # since this loss requires targets, we can either get them from a manifest or from a quantized contrastive loss
      loss_alpha: 1000.
      # multiplier applied to this loss relative to others
      transpose_encoded: false
      # transposing input may be necessary depending on which layer is used as input to decoder
      start_step: 0
      # determines what global step this loss starts being used at;
      # this can be set to a higher number if your training is long enough,
      # which may increase early training stability
      output_from_layer: null
      # if we wanted to use outputs from a non-final encoder layer as input to this decoder,
      # the layer name should be specified here

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.encoder.d_model}
      # scheduler config override
      warmup_steps: 25000
      warmup_ratio: null
      min_lr: 1e-6
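
    # For reference, the standard Noam schedule (a sketch; NeMo's NoamAnnealing additionally clamps at min_lr) is
    #   lr(step) = lr * d_model^(-0.5) * min(step^(-0.5), step * warmup_steps^(-1.5)),
    # which is why the nominal lr above (5.0) is much larger than a typical Adam learning rate.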

trainer:
  devices: -1 # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32 # Should be set to 16 for O1 and O2 to enable AMP.
  log_every_n_steps: 10 # Interval of logging.
  enable_progress_bar: True
  resume_from_checkpoint: null # The path to a checkpoint file to continue the training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  num_sanity_val_steps: 0 # Number of validation steps to run as a sanity check before training starts; setting it to 0 disables it.
  check_val_every_n_epoch: 1 # Run validation every n epochs.
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  benchmark: false # needs to be false for models with variable-length speech input, as it slows down training

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_loss"
    mode: "min"
    save_top_k: 5

  # you need to set these two to True to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
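
# To log to Weights & Biases, the logger can also be enabled from the command line with Hydra overrides
# (illustrative run/project names):
#   exp_manager.create_wandb_logger=true \
#   exp_manager.wandb_logger_kwargs.name="conformer-ssl-run1" \
#   exp_manager.wandb_logger_kwargs.project="ssl-pretraining"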