name: TransformerLanguageModel

do_training: True # set to False if only preprocessing data

model:
  label_smoothing: 0.0
  preproc_out_dir: null # path to store data preprocessing outputs

  train_ds:
    file_name: ??? # path to file with training data
    tokens_in_batch: 4096
    clean: true
    shuffle: true
    num_workers: 8
    # tarred dataset specific config
    # use_tarred_dataset: true
    # tar_files: ??? # path to tarred files
    # metadata_file: ??? # metadata for tarred dataset
    # shard_strategy: scatter
    # tar_shuffle_n: 256

  validation_ds:
    file_name: ??? # path to file with validation data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    file_name: ??? # path to file with test data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  tokenizer:
    tokenizer_name: yttm
    tokenizer_model: ???
    vocab_file: null
    special_tokens: null
    training_sample_size: null # valid for sentencepiece tokenizer

  encoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    num_layers: 6
    inner_size: 2048
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    mask_future: true
    pre_ln: false

  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0

trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: False

exp_manager:
  name: TransformerLM
  files_to_copy: []
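
# --- Usage sketch (commented so the file stays valid YAML; not part of the original config) ---
# A minimal, hedged example of supplying the required ??? fields at launch time.
# The script name, config path/name, and all file paths below are assumptions for
# illustration only. NeMo training scripts read configs like this one through Hydra,
# so the placeholders can be overridden on the command line:
#
#   python transformer_lm.py \
#     --config-path=conf --config-name=transformer_lm_config \
#     model.train_ds.file_name=/path/to/train.txt \
#     model.validation_ds.file_name=/path/to/valid.txt \
#     model.test_ds.file_name=/path/to/test.txt \
#     model.tokenizer.tokenizer_model=/path/to/tokenizer.model \
#     trainer.devices=1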