# Experiment configuration for a Transformer language model.
#
# NOTE(review): the nesting below was rebuilt from key semantics because the
# original indentation was lost; confirm it against the schema the consumer
# expects before relying on it.
#
# `???` marks a value that has no default in this file (presumably the
# OmegaConf mandatory-value marker -- confirm); it must be supplied by the
# caller before the config is used.
name: TransformerLanguageModel
do_training: true

model:
  label_smoothing: 0.0
  preproc_out_dir: null

  # Training data settings.
  train_ds:
    file_name: ???
    tokens_in_batch: 4096
    clean: true
    shuffle: true
    num_workers: 8

  # Validation data settings.
  validation_ds:
    file_name: ???
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Test data settings; values mirror validation_ds.
  test_ds:
    file_name: ???
    tokens_in_batch: 512
    clean: false
    shuffle: false
    num_samples: -1
    drop_last: false
    pin_memory: false
    num_workers: 8

  # Optimizer and learning-rate schedule.
  optim:
    name: adam
    lr: 0.001
    betas:
      - 0.9
      - 0.98
    weight_decay: 0.0
    sched:
      name: InverseSquareRootAnnealing
      min_lr: 0.0
      last_epoch: -1
      warmup_ratio: 0.1

  tokenizer:
    tokenizer_name: yttm
    tokenizer_model: ???
    vocab_file: null
    special_tokens: null
    training_sample_size: null

  # Transformer encoder hyperparameters.
  encoder:
    library: nemo
    model_name: null
    pretrained: false
    max_sequence_length: 512
    num_token_types: 0
    embedding_dropout: 0.1
    learn_positional_encodings: false
    hidden_size: 512
    num_layers: 6
    inner_size: 2048
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    hidden_act: relu
    mask_future: true
    pre_ln: false

  # Output head applied on top of the encoder.
  head:
    num_layers: 1
    activation: relu
    log_softmax: true
    dropout: 0.0

# Trainer settings (key names look like PyTorch Lightning Trainer
# arguments -- confirm against the consumer).
trainer:
  devices: 4
  num_nodes: 1
  max_epochs: 200
  precision: 16
  accelerator: gpu
  strategy: ddp
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 50
  check_val_every_n_epoch: 1
  benchmark: false

exp_manager:
  name: TransformerLM
  files_to_copy: []