name: TransformerLanguageModel
do_training: True # set to False if only preprocessing data

model:
  label_smoothing: 0.0
  preproc_out_dir: null # path to store data preprocessing outputs
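
  # Data sections: values set to ??? are mandatory (Hydra/OmegaConf convention) and
  # must be supplied at launch time, e.g. as command-line overrides. Batches are
  # sized by token count (tokens_in_batch) rather than by number of sentences.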
  train_ds:
    file_name: ??? # path to file with training data
    tokens_in_batch: 4096
    clean: true
    shuffle: true
    num_workers: 8
    # tarred dataset specific config
    # use_tarred_dataset: true
    # tar_files: ??? # path to tarred files
    # metadata_file: ??? # metadata for tarred dataset
    # shard_strategy: scatter
    # tar_shuffle_n: 256

  validation_ds:
    file_name: ??? # path to file with validation data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  test_ds:
    file_name: ??? # path to file with test data
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8
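
  # Optimization: Adam with inverse square-root annealing (the Noam-style schedule
  # from "Attention Is All You Need"); warmup_ratio: 0.1 warms the learning rate up
  # over the first 10% of training steps before the decay begins.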
  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1
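
  # Tokenizer: yttm is the YouTokenToMe BPE tokenizer; tokenizer_model must point to
  # a trained BPE model file. training_sample_size applies only when a sentencepiece
  # tokenizer is used instead.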
  tokenizer:
    tokenizer_name: yttm
    tokenizer_model: ???
    vocab_file: null
    special_tokens: null
    training_sample_size: null # valid for sentencepiece tokenizer
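
  # Encoder: a NeMo Transformer stack with the "base" Transformer dimensions
  # (6 layers, hidden size 512, FFN size 2048, 8 attention heads).
  # mask_future: true applies a causal attention mask, so the model is trained
  # as a left-to-right language model.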
  encoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    num_layers: 6
    inner_size: 2048
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    mask_future: true
    pre_ln: false
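
  # Head: a single-layer classifier that projects encoder outputs onto the
  # vocabulary; log_softmax: true makes it emit log-probabilities.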
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0
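
# Trainer: standard PyTorch Lightning Trainer arguments. Checkpointing and the
# logger are disabled here because exp_manager handles both.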
trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16 # Should be set to 16 for O1 and O2, default is 16 as PT ignores it when amp_level is O0
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: False
  logger: False
  log_every_n_steps: 50 # Interval of logging.
  check_val_every_n_epoch: 1
  benchmark: False
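
# Experiment manager: NeMo's exp_manager creates the run directory and takes care of
# TensorBoard logging and checkpointing for the experiment named below.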
exp_manager:
  name: TransformerLM
  files_to_copy: []
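
# Example launch (a sketch; the script path and config name assume the standard NeMo
# examples layout and may differ in your checkout):
#   python examples/nlp/language_modeling/transformer_lm.py \
#     --config-path=conf --config-name=transformer_lm_config \
#     model.train_ds.file_name=/path/to/train.txt \
#     model.validation_ds.file_name=/path/to/valid.txt \
#     model.test_ds.file_name=/path/to/test.txt \
#     model.tokenizer.tokenizer_model=/path/to/yttm_bpe.model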