# BERT Pretraining from Text
name: &name PretrainingBERTFromText

trainer:
  devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices
  num_nodes: 1
  max_epochs: 2 # the number of training epochs
  max_steps: -1 # precedence over max_epochs
  accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 16 # 16 to use AMP
  accelerator: gpu
  gradient_clip_val: 0.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # check once per epoch; 0.25 to check 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager

model:
  nemo_path: null # exported .nemo path
  only_mlm_loss: false # only use masked language model without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers
  max_seq_length: 128 # The maximum total input sequence length after tokenization. Sequences longer than this
  # will be truncated, and sequences shorter than this will be padded.
  mask_prob: 0.15 # Probability of masking a token in the input text during data processing.
  short_seq_prob: 0.1 # Probability of having a sequence shorter than the maximum sequence length `max_seq_length` in data processing.

  language_model:
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # json file, precedence over config

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec
    vocab_file: null # path to vocab file
    tokenizer_model: null # tokenizer model for sentencepiece
    special_tokens: # only necessary for adding transformer/BERT-specific special tokens if the tokenizer does not already have them
      unk_token: '[UNK]'
      sep_token: '[SEP]'
      pad_token: '[PAD]'
      bos_token: '[CLS]'
      mask_token: '[MASK]'
      eos_token: '[SEP]'
      cls_token: '[CLS]'

  train_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  validation_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: false
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  optim:
    name: adamw
    lr: 3e-5
    weight_decay: 0.0

    sched:
      name: CosineAnnealing
      warmup_steps: null
      warmup_ratio: 0.1
      min_lr: 0.0
      last_epoch: -1

exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
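
# Usage sketch (not part of the original config): configs like this are consumed by a
# Hydra-decorated NeMo training script, and any key above can be overridden on the command
# line via its dotted path. The script path and config name below (bert_pretraining.py,
# bert_pretraining_from_text_config) are assumptions based on the standard NeMo examples
# layout; adjust them to match your checkout.
#
#   python examples/nlp/language_modeling/bert_pretraining.py \
#     --config-path=conf \
#     --config-name=bert_pretraining_from_text_config \
#     model.train_ds.data_file=/path/to/train.txt \
#     model.validation_ds.data_file=/path/to/valid.txt \
#     trainer.devices=1 \
#     trainer.max_epochs=2
#
# Other keys follow the same pattern, e.g. model.optim.lr=3e-5 or
# model.language_model.pretrained_model_name=bert-large-uncased.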