# This config contains the default values for training a Citrinet model with CTC loss and BPE-based vocabulary.
# Default learning parameters in this config are set for effective batch size of 1k on 32 GPUs.
# To train it with smaller batch sizes, you may need to re-tune the learning parameters or use higher accumulate_grad_batches.
# If training for a short time, you can also reduce weight decay to 0.

# Training Recipe
# This model can be trained using the default settings in this config with FP32 precision.
# When training under AMP, increase `warmup_steps` to 5000 for stable training.
# In order to create Citrinet-C, find-replace `filters: 384` with `filters: C`.
# When reducing the receptive field of these models, it is advised to reduce the amount of augmentation
# for larger models from 10x time masking to 5x or 2x time masking.
# For further details regarding Citrinet, visit - https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/asr/configs.html#citrinet

# Notes on syntax used in this file:
#   - `???` marks a value that has no default here and must be supplied by the user.
#   - `${model.model_defaults.<key>}` is an interpolation that refers to the matching key under
#     `model.model_defaults`, so shared encoder settings can be changed in one place.
#   - `&anchor` / `*anchor` are plain YAML anchors and aliases (`name`, `sample_rate`, `n_mels`).

# Experiment name; re-used by `exp_manager.name` through the `*name` alias.
name: &name "Citrinet-384-8x-Stride"

model:
  # Anchored so that the preprocessor re-uses the same value via `*sample_rate`.
  sample_rate: &sample_rate 16000

  # Training dataset settings.
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    trim_silence: false
    max_duration: 16.7
    shuffle: true
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  # Validation dataset settings.
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # Test dataset settings; the manifest defaults to null (no test set configured).
  test_ds:
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true

  # Shared values referenced by the encoder/decoder through `${model.model_defaults.*}`.
  model_defaults:
    repeat: 5
    dropout: 0.0
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
    enc_final: 640

  tokenizer:
    dir: ???  # path to directory which contains either tokenizer.model (bpe) or vocab.txt (for wpe)
    type: ???  # Can be either bpe or wpe

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: *sample_rate
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    # Anchored so that `encoder.feat_in` re-uses the same value via `*n_mels`.
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2
    time_masks: 2
    freq_width: 27
    time_width: 0.05

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # Ordered list of encoder blocks. Three of them use `stride: [2]`
    # (2 * 2 * 2 = 8), matching the "8x-Stride" in the config name.
    jasper:
      # First block: single repeat, no residual connection.
      - filters: 384
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # Strided block 1 of 3.
      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [11]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [13]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [21]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Strided block 2 of 3.
      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [13]
        stride: [2]  # *stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [17]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [19]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [21]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [23]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [25]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Strided block 3 of 3.
      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [25]
        stride: [2]  # stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [27]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [29]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [31]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [33]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [35]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [37]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: 384
        repeat: ${model.model_defaults.repeat}
        kernel: [39]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Final block: single repeat, no residual; its width is `enc_final`,
      # which is also what `decoder.feat_in` refers to.
      - filters: ${model.model_defaults.enc_final}
        repeat: 1
        kernel: [41]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  decoder:
    _target_: nemo.collections.asr.modules.ConvASRDecoder
    feat_in: ${model.model_defaults.enc_final}
    num_classes: -1  # filled with vocabulary size from tokenizer at runtime
    vocabulary: []  # filled with vocabulary from tokenizer at runtime

  optim:
    name: novograd
    lr: 0.05

    # optimizer arguments
    betas: [0.8, 0.25]
    weight_decay: 0.001

    # scheduler setup
    sched:
      name: CosineAnnealing

      # scheduler config override
      warmup_steps: 1000
      warmup_ratio: null
      min_lr: 1e-5
      last_epoch: -1

trainer:
  devices: 1  # number of gpus
  max_epochs: 100
  max_steps: -1  # computed at runtime if not set
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 100  # Interval of logging.
  val_check_interval: 1.0  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  check_val_every_n_epoch: 1
  precision: 32
  sync_batchnorm: false
  benchmark: false  # needs to be false for models with variable-length speech input as it slows down training

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: "val_wer"
    mode: "min"
    save_top_k: 3
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
    entity: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false