name: megatron_t0

trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  precision: 16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 10
  val_check_interval: 300
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: megatron_t0
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: validation_${model.data.validation_ds.metric.name}
    save_top_k: 10
    mode: max
    always_save_nemo: False # TODO: add support
    filename: 'megatron_t0--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: True

model:
  restore_from_path: null # Path to a trained T5 .nemo file.
  pretrained_checkpoint:
    checkpoint_dir: null # Path to a folder that contains a .ckpt file.
    checkpoint_name: null # Name of the .ckpt file within checkpoint_dir.
    hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint.
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  pipeline_model_parallel_split_rank: 0
  gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory).
  megatron_amp_O2: False # Enable O2-level automatic mixed precision for Megatron.
  resume_from_checkpoint: null
  hidden_dropout: 0.1 # Overrides the hidden-state dropout probability from pretraining.
  attention_dropout: 0.1 # Overrides the attention dropout probability from pretraining.

  data:
    train_ds:
      file_names: ??? # List of paths to JSONL files containing the source data.
      global_batch_size: 128
      micro_batch_size: 16
      shuffle: True
      num_workers: 8
      pin_memory: True
      max_src_seq_length: 512
      max_tgt_seq_length: 512
      drop_last: True
      concat_sampling_probabilities: ??? # When providing a list of datasets, defines the sampling probability of each dataset when strategy='random'.
      replace_bos_with_pad: False # Replaces BOS with PAD for both the encoder and decoder. This is necessary when using Google's T5 checkpoints.
      add_bos_to_input: False # Adds BOS to the input sequence.
      add_eos_to_input: False # Adds EOS to the input sequence.
      seed: 1234

    validation_ds:
      file_names: ??? # List of paths to JSONL files containing the source data.
      names: null # Names of the corresponding datasets, used to log metrics.
      global_batch_size: 16
      micro_batch_size: 16
      shuffle: False
      num_workers: 0
      pin_memory: True
      max_src_seq_length: 512
      max_tgt_seq_length: 512
      drop_last: False # TODO: Figure out if there is a way to avoid dropping the last batch.
      write_predictions_to_file: False
      output_file_path_prefix: null # Prefix of the file to write predictions to.
      metric:
        name: "exact_string_match" # Name of the evaluation metric to use.
        average: null # How to average the metric over the dataset. Options: ['macro', 'micro']. Works only for metrics such as F1 and accuracy; refer to torchmetrics for metrics where this is supported.
        num_classes: null
      replace_bos_with_pad: ${model.data.train_ds.replace_bos_with_pad} # Mirror the train_ds settings.
      add_bos_to_input: ${model.data.train_ds.add_bos_to_input}
      add_eos_to_input: ${model.data.train_ds.add_eos_to_input}
      seed: 1234

  optim:
    name: fused_adam
    lr: 5e-6
    weight_decay: 0.0
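
# Usage sketch (not part of the original config; the script name and paths below are
# assumptions): the fields marked ??? are mandatory OmegaConf values and must be supplied
# before training, typically as Hydra-style command-line overrides when launching NeMo's
# T0 training entry point. The sampling probabilities are generally expected to sum to 1.
#
#   python megatron_t0_train.py \
#     model.restore_from_path=/path/to/pretrained_t5.nemo \
#     model.data.train_ds.file_names='[/data/t0/train_task_a.jsonl,/data/t0/train_task_b.jsonl]' \
#     model.data.train_ds.concat_sampling_probabilities='[0.5,0.5]' \
#     model.data.validation_ds.file_names='[/data/t0/val_task_a.jsonl]'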