# BERT Pretraining from Text
name: &name PretrainingBERTFromText

trainer:
  devices: 1 # the number of gpus, 0 for CPU, or list with gpu indices
  num_nodes: 1
  max_epochs: 2 # the number of training epochs
  max_steps: -1 # precedence over max_epochs
  accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 16 # 16 to use AMP
  accelerator: gpu
  gradient_clip_val: 0.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # check once per epoch; 0.25 to check 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager

model:
  nemo_path: null # exported .nemo path
  only_mlm_loss: false # only use masked language model without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers
  max_seq_length: 128 # The maximum total input sequence length after tokenization. Sequences longer than this
  # will be truncated, and sequences shorter than this will be padded.
  mask_prob: 0.15 # Probability of masking a token in the input text during data processing.
  short_seq_prob: 0.1 # Probability of having a sequence shorter than the maximum sequence length `max_seq_length` in data processing.

  language_model:
    pretrained_model_name: bert-base-uncased
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # json file, precedence over config

  tokenizer:
    tokenizer_name: ${model.language_model.pretrained_model_name} # tokenizer that inherits from TokenizerSpec
    vocab_file: null # path to vocab file
    tokenizer_model: null # tokenizer model for sentencepiece
    special_tokens: # only necessary for adding transformer/BERT-specific special tokens if the tokenizer does not already have them
      unk_token: '[UNK]'
      sep_token: '[SEP]'
      pad_token: '[PAD]'
      bos_token: '[CLS]'
      mask_token: '[MASK]'
      eos_token: '[SEP]'
      cls_token: '[CLS]'

  train_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  validation_ds:
    data_file: ??? # path to data file
    max_seq_length: ${model.max_seq_length}
    mask_prob: ${model.mask_prob}
    short_seq_prob: ${model.short_seq_prob}
    batch_size: 16 # per GPU
    shuffle: false
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false

  optim:
    name: adamw
    lr: 3e-5
    weight_decay: 0.0

    sched:
      name: CosineAnnealing
      warmup_steps: null
      warmup_ratio: 0.1
      min_lr: 0.0
      last_epoch: -1

exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True

hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
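
# Usage sketch (not part of the original config): configs like this are consumed by a
# Hydra-decorated NeMo training script, and any key above can be overridden on the command
# line via its dotted path. The script path and config name below (bert_pretraining.py,
# bert_pretraining_from_text_config) are assumptions based on the standard NeMo examples
# layout; adjust them to match your checkout.
#
#   python examples/nlp/language_modeling/bert_pretraining.py \
#     --config-path=conf \
#     --config-name=bert_pretraining_from_text_config \
#     model.train_ds.data_file=/path/to/train.txt \
#     model.validation_ds.data_file=/path/to/valid.txt \
#     trainer.devices=1 \
#     trainer.max_epochs=2
#
# Other keys follow the same pattern, e.g. model.optim.lr=3e-5 or
# model.language_model.pretrained_model_name=bert-large-uncased.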