name: megatron_gpt_20b
restore_from_path: null # used when starting from a .nemo file

trainer:
  devices: 8
  num_nodes: 4
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  replace_sampler_ddp: False
  max_epochs: -1 # PTL default. In practice, max_steps will be reached first.
  max_steps: 200 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
  log_every_n_steps: 1
  val_check_interval: 20
  # check_val_every_n_epoch: null
  limit_val_batches: 2
  limit_test_batches: 0
  accumulate_grad_batches: 1 # do not modify, grad acc is automatic for training megatron models
  gradient_clip_val: 1.0
  benchmark: False

exp_manager:
  # set this to save checkpoints
  explicit_log_dir: ppo_sentiments_logs
  exp_dir: null
  name: megatron_gpt_20b_ppo_sentiments
  create_tensorboard_logger: False
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: trlxnemo
    name: megatron_gpt_20b_ppo_sentiments
  resume_if_exists: False
  resume_ignore_no_checkpoint: True
  # set this to save checkpoints
  create_checkpoint_callback: False
  checkpoint_callback_params:
    monitor: reduced_train_loss
    save_top_k: 1
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: True # not recommended when training large models on clusters with short time limits
    filename: 'megatron_gpt-{reduced_train_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${model.tensor_model_parallel_size}, ${model.pipeline_model_parallel_size}}
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5

model:
  micro_batch_size: 2
  global_batch_size: 64
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  resume_from_checkpoint: null # manually set the checkpoint file to load from

  # model architecture
  encoder_seq_length: 2048
  max_position_embeddings: 2048
  num_layers: 44
  hidden_size: 6144
  ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size.
  num_attention_heads: 48
  init_method_std: 0.007 # Standard deviation of the zero-mean normal distribution used for weight initialization.
  hidden_dropout: 0.1 # Dropout probability for hidden state transformer.
  kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null.
  apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number.
  layernorm_epsilon: 1e-5
  make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency.
  pre_process: True # add embedding
  post_process: True # add pooler
  persist_layer_norm: True # Use of persistent fused layer norm kernel.
  grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
  gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs

  ## Activation Checkpointing
  activations_checkpoint_granularity: 'selective' # 'selective' or 'full'
  activations_checkpoint_method: 'uniform' # 'uniform', 'block'; not used with 'selective'
  activations_checkpoint_num_layers: null # not used with 'selective'

  ## Sequence Parallelism
  sequence_parallel: True

  tokenizer:
    library: 'megatron'
    type: 'GPT2BPETokenizer'
    model: null
    vocab_file: null
    merge_file: null
    delimiter: null # only used for tabular tokenizer
    sentencepiece_legacy: false # Legacy=True allows you to add special tokens to sentencepiece tokenizers.

  # precision
  native_amp_init_scale: 4294967296 # 2 ** 32
  native_amp_growth_interval: 1000
  hysteresis: 2 # Gradient scale hysteresis
  fp32_residual_connection: False # Move residual connections to fp32
  fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16

  # Megatron O2-style half-precision
  megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters
  grad_allreduce_chunk_size_mb: 125
  sync_batch_comm: False

  # miscellaneous
  seed: 1234
  use_cpu_initialization: False # Init weights on the CPU (slow for large models)
  onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
  apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
  gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

  data:
    data_prefix:
      - dataset: hh
    index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
    data_impl: mmap
    splits_string: 900,50,50
    seq_length: ${model.encoder_seq_length}
    skip_warmup: True
    num_workers: 2
    dataloader_type: cyclic
    reset_position_ids: False # Reset position ids after end-of-document token
    reset_attention_mask: False # Reset attention mask after end-of-document token
    eod_mask_loss: False # Mask loss for the end of document tokens

  # Nsys profiling options
  nsys_profile:
    enabled: False
    start_step: 10 # Global batch to start profiling
    end_step: 10 # Global batch to end profiling
    ranks: [0, 4, 8, 12] # Global rank IDs to profile
    gen_shape: False # Generate model and kernel details including input shapes

  optim:
    name: distributed_fused_adam
    lr: 6.0e-5
    weight_decay: 1.0e-6
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 0
      constant_steps: 10000000
      min_lr: 5.0e-5
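
The `${multiply:...}` entries above are OmegaConf interpolations backed by a custom resolver, so they only resolve once that resolver has been registered (the training script is expected to do this before loading the file). Below is a minimal, hypothetical Python sketch of loading the config and checking the parallelism arithmetic it implies; the file name megatron_20b.yaml and the resolver registration shown are assumptions for illustration, not part of the original file.

# Minimal sketch (assumptions: omegaconf is installed and the YAML above is
# saved as "megatron_20b.yaml"; the resolver name "multiply" matches the
# ${multiply:...} interpolations used in the config).
from omegaconf import OmegaConf

# "multiply" is not a built-in OmegaConf resolver, so register it before
# any ${multiply:...} field is accessed.
OmegaConf.register_new_resolver("multiply", lambda x, y: x * y)

cfg = OmegaConf.load("megatron_20b.yaml")

# Interpolations resolve on access:
# ffn_hidden_size = 4 * hidden_size = 4 * 6144 = 24576
# model_parallel_size = tensor_parallel * pipeline_parallel = 4 * 1 = 4
print(cfg.model.ffn_hidden_size)
print(cfg.exp_manager.checkpoint_callback_params.model_parallel_size)

# Batch-size arithmetic implied by the values above:
# world_size = devices * num_nodes = 8 * 4 = 32
# data_parallel_size = world_size / (TP * PP) = 32 / 4 = 8
# gradient accumulation = global_batch / (micro_batch * DP) = 64 / (2 * 8) = 4
world_size = cfg.trainer.devices * cfg.trainer.num_nodes
dp_size = world_size // (
    cfg.model.tensor_model_parallel_size * cfg.model.pipeline_model_parallel_size
)
grad_acc_steps = cfg.model.global_batch_size // (cfg.model.micro_batch_size * dp_size)
print(world_size, dp_size, grad_acc_steps)  # 32 8 4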