|
{ |
|
"best_metric": 0.08551913499832153, |
|
"best_model_checkpoint": "./fine-tuned/checkpoint-3000", |
|
"epoch": 0.48, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 14499.107421875, |
|
"learning_rate": 2.9919999999999998e-05, |
|
"loss": 0.3351, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 9562.1748046875, |
|
"learning_rate": 2.9840000000000002e-05, |
|
"loss": 0.0964, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 11098.59375, |
|
"learning_rate": 2.976e-05, |
|
"loss": 0.0895, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 9281.0146484375, |
|
"learning_rate": 2.968e-05, |
|
"loss": 0.0797, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 10050.3623046875, |
|
"learning_rate": 2.96e-05, |
|
"loss": 0.0812, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 7611.0849609375, |
|
"learning_rate": 2.9520000000000002e-05, |
|
"loss": 0.0755, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 9915.1259765625, |
|
"learning_rate": 2.944e-05, |
|
"loss": 0.0793, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 10182.263671875, |
|
"learning_rate": 2.936e-05, |
|
"loss": 0.0775, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 11287.8271484375, |
|
"learning_rate": 2.928e-05, |
|
"loss": 0.0782, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 6672.08251953125, |
|
"learning_rate": 2.92e-05, |
|
"loss": 0.0811, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.09235642850399017, |
|
"eval_runtime": 109.274, |
|
"eval_samples_per_second": 18.303, |
|
"eval_steps_per_second": 2.288, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 6587.6513671875, |
|
"learning_rate": 2.9120000000000002e-05, |
|
"loss": 0.0815, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 6632.0947265625, |
|
"learning_rate": 2.904e-05, |
|
"loss": 0.0794, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 9301.228515625, |
|
"learning_rate": 2.896e-05, |
|
"loss": 0.076, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 10575.0791015625, |
|
"learning_rate": 2.888e-05, |
|
"loss": 0.0791, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 8609.86328125, |
|
"learning_rate": 2.88e-05, |
|
"loss": 0.0799, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 11379.4423828125, |
|
"learning_rate": 2.8720000000000003e-05, |
|
"loss": 0.0759, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 8489.6904296875, |
|
"learning_rate": 2.864e-05, |
|
"loss": 0.0753, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 12353.6279296875, |
|
"learning_rate": 2.856e-05, |
|
"loss": 0.075, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 11535.3994140625, |
|
"learning_rate": 2.8480000000000002e-05, |
|
"loss": 0.0757, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8291.939453125, |
|
"learning_rate": 2.84e-05, |
|
"loss": 0.0753, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_loss": 0.08949962258338928, |
|
"eval_runtime": 109.2536, |
|
"eval_samples_per_second": 18.306, |
|
"eval_steps_per_second": 2.288, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 8266.658203125, |
|
"learning_rate": 2.832e-05, |
|
"loss": 0.0767, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 6160.548828125, |
|
"learning_rate": 2.824e-05, |
|
"loss": 0.067, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 7343.408203125, |
|
"learning_rate": 2.816e-05, |
|
"loss": 0.0717, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 5661.76318359375, |
|
"learning_rate": 2.8080000000000002e-05, |
|
"loss": 0.0733, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 8678.46484375, |
|
"learning_rate": 2.8e-05, |
|
"loss": 0.0737, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 6331.21533203125, |
|
"learning_rate": 2.792e-05, |
|
"loss": 0.0696, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 10563.5400390625, |
|
"learning_rate": 2.784e-05, |
|
"loss": 0.0747, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 7221.74365234375, |
|
"learning_rate": 2.7760000000000002e-05, |
|
"loss": 0.0716, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 6486.46142578125, |
|
"learning_rate": 2.768e-05, |
|
"loss": 0.0711, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6838.505859375, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 0.0703, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_loss": 0.08808805048465729, |
|
"eval_runtime": 109.2355, |
|
"eval_samples_per_second": 18.309, |
|
"eval_steps_per_second": 2.289, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 6751.6494140625, |
|
"learning_rate": 2.752e-05, |
|
"loss": 0.0781, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 5040.9033203125, |
|
"learning_rate": 2.7439999999999998e-05, |
|
"loss": 0.0686, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 8748.07421875, |
|
"learning_rate": 2.7360000000000002e-05, |
|
"loss": 0.0689, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 5971.705078125, |
|
"learning_rate": 2.728e-05, |
|
"loss": 0.0671, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 10833.1357421875, |
|
"learning_rate": 2.72e-05, |
|
"loss": 0.0734, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 10036.919921875, |
|
"learning_rate": 2.712e-05, |
|
"loss": 0.0715, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 7755.1669921875, |
|
"learning_rate": 2.704e-05, |
|
"loss": 0.0669, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 7584.822265625, |
|
"learning_rate": 2.696e-05, |
|
"loss": 0.0699, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 10103.142578125, |
|
"learning_rate": 2.688e-05, |
|
"loss": 0.07, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 5768.24267578125, |
|
"learning_rate": 2.68e-05, |
|
"loss": 0.0709, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_loss": 0.08704760670661926, |
|
"eval_runtime": 109.4348, |
|
"eval_samples_per_second": 18.276, |
|
"eval_steps_per_second": 2.284, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 6016.46826171875, |
|
"learning_rate": 2.672e-05, |
|
"loss": 0.0663, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 6869.53076171875, |
|
"learning_rate": 2.6640000000000002e-05, |
|
"loss": 0.073, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 6099.595703125, |
|
"learning_rate": 2.656e-05, |
|
"loss": 0.0667, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 6923.919921875, |
|
"learning_rate": 2.648e-05, |
|
"loss": 0.0653, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 8005.85595703125, |
|
"learning_rate": 2.64e-05, |
|
"loss": 0.0685, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 6473.466796875, |
|
"learning_rate": 2.632e-05, |
|
"loss": 0.0678, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 7177.6328125, |
|
"learning_rate": 2.6240000000000003e-05, |
|
"loss": 0.0637, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 5574.75439453125, |
|
"learning_rate": 2.616e-05, |
|
"loss": 0.0698, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 6910.39599609375, |
|
"learning_rate": 2.608e-05, |
|
"loss": 0.0645, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5913.9775390625, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.068, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.08615937829017639, |
|
"eval_runtime": 109.2621, |
|
"eval_samples_per_second": 18.305, |
|
"eval_steps_per_second": 2.288, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 7447.5625, |
|
"learning_rate": 2.592e-05, |
|
"loss": 0.0672, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 7057.10009765625, |
|
"learning_rate": 2.584e-05, |
|
"loss": 0.0683, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 8279.7392578125, |
|
"learning_rate": 2.576e-05, |
|
"loss": 0.0631, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 7663.275390625, |
|
"learning_rate": 2.568e-05, |
|
"loss": 0.0698, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7116.74609375, |
|
"learning_rate": 2.5600000000000002e-05, |
|
"loss": 0.0703, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 8839.5986328125, |
|
"learning_rate": 2.552e-05, |
|
"loss": 0.0654, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 7157.17333984375, |
|
"learning_rate": 2.544e-05, |
|
"loss": 0.0628, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 7690.267578125, |
|
"learning_rate": 2.536e-05, |
|
"loss": 0.0694, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 5030.39501953125, |
|
"learning_rate": 2.5280000000000002e-05, |
|
"loss": 0.0654, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 7269.51171875, |
|
"learning_rate": 2.52e-05, |
|
"loss": 0.0732, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.08551913499832153, |
|
"eval_runtime": 109.2626, |
|
"eval_samples_per_second": 18.305, |
|
"eval_steps_per_second": 2.288, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 18750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.461498937344e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|