# BERT Pretraining from Preprocessed (tokenized) data
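#
# Launch sketch (illustrative; the driver script path is assumed from the
# standard NeMo examples layout, adjust for your checkout):
#   python examples/nlp/language_modeling/bert_pretraining.py \
#       model.train_ds.data_file=/path/to/preprocessed/hdf5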
name: &name PretrainingBERTFromPreprocessed
trainer:
  devices: 8 # number of GPUs, 0 for CPU, or a list of GPU indices
  num_nodes: 1
  max_steps: 2285714 # takes precedence over max_epochs
  num_sanity_val_steps: 0 # required when pretraining BERT from preprocessed data
  replace_sampler_ddp: false # required when pretraining BERT from preprocessed data
  accumulate_grad_batches: 1 # accumulates grads every k batches
  precision: 16 # set to 16 to enable mixed precision (AMP)
  accelerator: gpu
  gradient_clip_val: 1.0
  log_every_n_steps: 1
  val_check_interval: 1.0 # 1.0 checks once per epoch; 0.25 checks 4 times per epoch
  enable_checkpointing: False # provided by exp_manager
  logger: false # provided by exp_manager
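  # Note (assumption from typical PyTorch Lightning semantics): replace_sampler_ddp
  # stays false because the preprocessed dataset supplies its own distributed sampler,
  # which Lightning would otherwise replace with a default DistributedSampler.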

model:
  nemo_path: null # path for saving the exported .nemo file
  only_mlm_loss: true # use only the masked language model loss, without next sentence prediction
  num_tok_classification_layers: 1 # number of token classification head output layers
  num_seq_classification_layers: 2 # number of sequence classification head output layers


  language_model:
    pretrained_model_name: bert-base-uncased # HuggingFace model name
    lm_checkpoint: null
    config:
      attention_probs_dropout_prob: 0.1
      hidden_act: gelu
      hidden_dropout_prob: 0.1
      hidden_size: 768
      initializer_range: 0.02
      intermediate_size: 3072
      max_position_embeddings: 512
      num_attention_heads: 12
      num_hidden_layers: 12
      type_vocab_size: 2
      vocab_size: 30522
    config_file: null # json file, precedence over config
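    # The inline config above mirrors the stock HuggingFace bert-base-uncased
    # architecture (12 layers, 12 heads, hidden size 768, vocab size 30522).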

  tokenizer: null # data is already tokenized during preprocessing, so no tokenizer is needed here

  train_ds:
    data_file: ??? # path to an HDF5 file (or a directory of HDF5 files); ??? marks the field as mandatory
    max_predictions_per_seq: 80
    batch_size: 16
    shuffle: true
    num_samples: -1
    num_workers: 2
    drop_last: false
    pin_memory: false
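    # Effective global batch size = batch_size * devices * num_nodes *
    # accumulate_grad_batches = 16 * 8 * 1 * 1 = 128 sequences per optimizer step.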

  optim:
    name: adamw
    lr: 0.4375e-4
    weight_decay: 0.01

    sched:
      name: SquareRootAnnealing
      warmup_steps: null
      warmup_ratio: 0.01
      min_lr: 0.0
      last_epoch: -1
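      # With warmup_steps null, warmup_ratio takes effect; in NeMo schedulers this
      # typically resolves to warmup_ratio * max_steps = 0.01 * 2285714 ~ 22857 steps.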


exp_manager:
  exp_dir: null # where to store logs and checkpoints
  name: *name # name of experiment
  create_tensorboard_logger: True
  create_checkpoint_callback: True


hydra:
  run:
    dir: .
  job_logging:
    root:
      handlers: null
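
# Any value above can be overridden on the command line via Hydra dot notation,
# e.g. (illustrative): python bert_pretraining.py trainer.devices=4 model.optim.lr=2e-5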