name: TransformerLanguageModel

do_training: True # set to False if only preprocessing data

model:
  label_smoothing: 0.0
  preproc_out_dir: null # path to store data preprocessing outputs

  train_ds:
    file_name: ??? # path to file with training data
    tokens_in_batch: 4096
    clean: true
    shuffle: true
    num_workers: 8
    # tarred dataset specific config
    # use_tarred_dataset: true
    # tar_files: ??? # path to tarred files
    # metadata_file: ??? # metadata for tarred dataset
    # shard_strategy: scatter
    # tar_shuffle_n: 256

  validation_ds:
    file_name: ??? # path to file with validation data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    file_name: ??? # path to file with test data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  tokenizer:
    tokenizer_name: yttm
    tokenizer_model: ???
    vocab_file: null
    special_tokens: null
    training_sample_size: null # valid for sentencepiece tokenizer

  encoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    num_layers: 6
    inner_size: 2048
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    mask_future: true
    pre_ln: false

  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0

trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: False

exp_manager:
  name: TransformerLM
  files_to_copy: []
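
# --- Usage sketch (commented so the file stays valid YAML; not part of the original config) ---
# A minimal, hedged example of supplying the required ??? fields at launch time.
# The script name, config path/name, and all file paths below are assumptions for
# illustration only. NeMo training scripts read configs like this one through Hydra,
# so the placeholders can be overridden on the command line:
#
#   python transformer_lm.py \
#     --config-path=conf --config-name=transformer_lm_config \
#     model.train_ds.file_name=/path/to/train.txt \
#     model.validation_ds.file_name=/path/to/valid.txt \
#     model.test_ds.file_name=/path/to/test.txt \
#     model.tokenizer.tokenizer_model=/path/to/tokenizer.model \
#     trainer.devices=1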