name: megatron_gpt_20b
restore_from_path: null  # used when starting from a .nemo file

trainer:
  devices: 8
  num_nodes: 4
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 200 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches (see the worked example at the end of this block)
  log_every_n_steps: 1
  val_check_interval: 20
  # check_val_every_n_epoch: null
  limit_val_batches: 2
  limit_test_batches: 0
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False
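  # Worked example, derived from the values in this file: devices=8 and num_nodes=4
  # give a world size of 32 GPUs; with tensor_model_parallel_size=4 and
  # pipeline_model_parallel_size=1, data_parallel_size = 32 / (4 * 1) = 8.
  # Each optimizer step then consumes global_batch_size = 64 samples
  # (micro_batch_size 2 * data_parallel_size 8 * 4 gradient-accumulation
  # micro-batches handled internally by Megatron), so after max_steps=200
  # roughly 200 * 64 = 12,800 samples have been consumed.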

exp_manager:
  # set this to save checkpoints
  explicit_log_dir: ppo_sentiments_logs
  exp_dir: null
  name: megatron_gpt_20b_ppo_sentiments
  create_tensorboard_logger: False
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: trlxnemo
    name: megatron_gpt_20b_ppo_sentiments
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  # set this to save checkpoints
  create_checkpoint_callback: False
  checkpoint_callback_params:
    monitor: reduced_train_loss
    save_top_k: 1
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt-{reduced_train_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
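    # With the model-parallel sizes set below, this resolves to 4 * 1 = 4.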
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5

model:
  micro_batch_size: 2
  global_batch_size: 64
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from
  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 44
  hidden_size: 6144
  ffn_hidden_size: ${multiply:4, ${.hidden_size}}  # Transformer FFN hidden size. 4 * hidden_size.
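  # With hidden_size=6144 this resolves to 24576.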
  num_attention_heads: 48
  init_method_std: 0.007  # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.1  # Dropout probability for hidden state transformer.
  kv_channels: null  # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null
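  # Left null here, so it resolves to 6144 // 48 = 128 channels per attention head.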
  apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  persist_layer_norm: True # Use of persistent fused layer norm kernel.
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs


  ## Activation Checkpointing
  activations_checkpoint_granularity: 'selective' # 'selective' or 'full'
  activations_checkpoint_method: 'uniform' # 'uniform', 'block', not used with 'selective'
  activations_checkpoint_num_layers: null # not used with 'selective'

  ## Sequence Parallelism
  sequence_parallel: True
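  # Sequence parallelism shards activations (e.g. LayerNorm and dropout inputs) along
  # the sequence dimension across the tensor-parallel group, so it only has an effect
  # when tensor_model_parallel_size > 1 (here it is 4).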

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for tabular tokenizer
    sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
  grad_allreduce_chunk_size_mb: 125
  sync_batch_comm: False
  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level; only logs with severity greater than or equal to this are shown
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  data:
    data_prefix:
        - dataset: hh
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    data_impl: mmap
    splits_string: 900,50,50
    seq_length: ${model.encoder_seq_length}
    skip_warmup: True
    num_workers: 2
    dataloader_type: cyclic
    reset_position_ids: False # Reset position ids after end-of-document token
    reset_attention_mask: False # Reset attention mask after end-of-document token
    eod_mask_loss: False # Mask loss for the end of document tokens

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10  # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0, 4, 8, 12] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes
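    # Illustrative launch command for when profiling is enabled (the training script
    # name is a placeholder; the nsys flags follow the usual capture-range workflow):
    #   nsys profile -s none -t nvtx,cuda --capture-range=cudaProfilerApi \
    #     --capture-range-end=stop -o report python <your_training_script>.py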

  optim:
    name: distributed_fused_adam
    lr: 6.0e-5
    weight_decay: 1.0e-6
    betas:
    - 0.9
    - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 10000000
      min_lr: 5.0e-5