{ "best_metric": 0.08321517705917358, "best_model_checkpoint": "./fine-tuned/checkpoint-6000", "epoch": 1.04, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 14499.107421875, "learning_rate": 2.9919999999999998e-05, "loss": 0.3351, "step": 50 }, { "epoch": 0.016, "grad_norm": 9562.1748046875, "learning_rate": 2.9840000000000002e-05, "loss": 0.0964, "step": 100 }, { "epoch": 0.024, "grad_norm": 11098.59375, "learning_rate": 2.976e-05, "loss": 0.0895, "step": 150 }, { "epoch": 0.032, "grad_norm": 9281.0146484375, "learning_rate": 2.968e-05, "loss": 0.0797, "step": 200 }, { "epoch": 0.04, "grad_norm": 10050.3623046875, "learning_rate": 2.96e-05, "loss": 0.0812, "step": 250 }, { "epoch": 0.048, "grad_norm": 7611.0849609375, "learning_rate": 2.9520000000000002e-05, "loss": 0.0755, "step": 300 }, { "epoch": 0.056, "grad_norm": 9915.1259765625, "learning_rate": 2.944e-05, "loss": 0.0793, "step": 350 }, { "epoch": 0.064, "grad_norm": 10182.263671875, "learning_rate": 2.936e-05, "loss": 0.0775, "step": 400 }, { "epoch": 0.072, "grad_norm": 11287.8271484375, "learning_rate": 2.928e-05, "loss": 0.0782, "step": 450 }, { "epoch": 0.08, "grad_norm": 6672.08251953125, "learning_rate": 2.92e-05, "loss": 0.0811, "step": 500 }, { "epoch": 0.08, "eval_loss": 0.09235642850399017, "eval_runtime": 116.7651, "eval_samples_per_second": 17.128, "eval_steps_per_second": 2.141, "step": 500 }, { "epoch": 0.088, "grad_norm": 6587.6513671875, "learning_rate": 2.9120000000000002e-05, "loss": 0.0815, "step": 550 }, { "epoch": 0.096, "grad_norm": 6632.0947265625, "learning_rate": 2.904e-05, "loss": 0.0794, "step": 600 }, { "epoch": 0.104, "grad_norm": 9301.228515625, "learning_rate": 2.896e-05, "loss": 0.076, "step": 650 }, { "epoch": 0.112, "grad_norm": 10575.0791015625, "learning_rate": 2.888e-05, "loss": 0.0791, "step": 700 }, { "epoch": 0.12, "grad_norm": 8609.86328125, "learning_rate": 2.88e-05, "loss": 0.0799, "step": 750 }, { "epoch": 0.128, "grad_norm": 11379.4423828125, "learning_rate": 2.8720000000000003e-05, "loss": 0.0759, "step": 800 }, { "epoch": 0.136, "grad_norm": 8489.6904296875, "learning_rate": 2.864e-05, "loss": 0.0753, "step": 850 }, { "epoch": 0.144, "grad_norm": 12353.6279296875, "learning_rate": 2.856e-05, "loss": 0.075, "step": 900 }, { "epoch": 0.152, "grad_norm": 11535.3994140625, "learning_rate": 2.8480000000000002e-05, "loss": 0.0757, "step": 950 }, { "epoch": 0.16, "grad_norm": 8291.939453125, "learning_rate": 2.84e-05, "loss": 0.0753, "step": 1000 }, { "epoch": 0.16, "eval_loss": 0.08949962258338928, "eval_runtime": 116.7407, "eval_samples_per_second": 17.132, "eval_steps_per_second": 2.141, "step": 1000 }, { "epoch": 0.168, "grad_norm": 8266.658203125, "learning_rate": 2.832e-05, "loss": 0.0767, "step": 1050 }, { "epoch": 0.176, "grad_norm": 6160.548828125, "learning_rate": 2.824e-05, "loss": 0.067, "step": 1100 }, { "epoch": 0.184, "grad_norm": 7343.408203125, "learning_rate": 2.816e-05, "loss": 0.0717, "step": 1150 }, { "epoch": 0.192, "grad_norm": 5661.76318359375, "learning_rate": 2.8080000000000002e-05, "loss": 0.0733, "step": 1200 }, { "epoch": 0.2, "grad_norm": 8678.46484375, "learning_rate": 2.8e-05, "loss": 0.0737, "step": 1250 }, { "epoch": 0.208, "grad_norm": 6331.21533203125, "learning_rate": 2.792e-05, "loss": 0.0696, "step": 1300 }, { "epoch": 0.216, "grad_norm": 10563.5400390625, "learning_rate": 2.784e-05, "loss": 0.0747, "step": 1350 }, { "epoch": 0.224, "grad_norm": 7221.74365234375, "learning_rate": 2.7760000000000002e-05, "loss": 0.0716, "step": 1400 }, { "epoch": 0.232, "grad_norm": 6486.46142578125, "learning_rate": 2.768e-05, "loss": 0.0711, "step": 1450 }, { "epoch": 0.24, "grad_norm": 6838.505859375, "learning_rate": 2.7600000000000003e-05, "loss": 0.0703, "step": 1500 }, { "epoch": 0.24, "eval_loss": 0.08808805048465729, "eval_runtime": 116.8722, "eval_samples_per_second": 17.113, "eval_steps_per_second": 2.139, "step": 1500 }, { "epoch": 0.248, "grad_norm": 6751.6494140625, "learning_rate": 2.752e-05, "loss": 0.0781, "step": 1550 }, { "epoch": 0.256, "grad_norm": 5040.9033203125, "learning_rate": 2.7439999999999998e-05, "loss": 0.0686, "step": 1600 }, { "epoch": 0.264, "grad_norm": 8748.07421875, "learning_rate": 2.7360000000000002e-05, "loss": 0.0689, "step": 1650 }, { "epoch": 0.272, "grad_norm": 5971.705078125, "learning_rate": 2.728e-05, "loss": 0.0671, "step": 1700 }, { "epoch": 0.28, "grad_norm": 10833.1357421875, "learning_rate": 2.72e-05, "loss": 0.0734, "step": 1750 }, { "epoch": 0.288, "grad_norm": 10036.919921875, "learning_rate": 2.712e-05, "loss": 0.0715, "step": 1800 }, { "epoch": 0.296, "grad_norm": 7755.1669921875, "learning_rate": 2.704e-05, "loss": 0.0669, "step": 1850 }, { "epoch": 0.304, "grad_norm": 7584.822265625, "learning_rate": 2.696e-05, "loss": 0.0699, "step": 1900 }, { "epoch": 0.312, "grad_norm": 10103.142578125, "learning_rate": 2.688e-05, "loss": 0.07, "step": 1950 }, { "epoch": 0.32, "grad_norm": 5768.24267578125, "learning_rate": 2.68e-05, "loss": 0.0709, "step": 2000 }, { "epoch": 0.32, "eval_loss": 0.08704760670661926, "eval_runtime": 116.8362, "eval_samples_per_second": 17.118, "eval_steps_per_second": 2.14, "step": 2000 }, { "epoch": 0.328, "grad_norm": 6016.46826171875, "learning_rate": 2.672e-05, "loss": 0.0663, "step": 2050 }, { "epoch": 0.336, "grad_norm": 6869.53076171875, "learning_rate": 2.6640000000000002e-05, "loss": 0.073, "step": 2100 }, { "epoch": 0.344, "grad_norm": 6099.595703125, "learning_rate": 2.656e-05, "loss": 0.0667, "step": 2150 }, { "epoch": 0.352, "grad_norm": 6923.919921875, "learning_rate": 2.648e-05, "loss": 0.0653, "step": 2200 }, { "epoch": 0.36, "grad_norm": 8005.85595703125, "learning_rate": 2.64e-05, "loss": 0.0685, "step": 2250 }, { "epoch": 0.368, "grad_norm": 6473.466796875, "learning_rate": 2.632e-05, "loss": 0.0678, "step": 2300 }, { "epoch": 0.376, "grad_norm": 7177.6328125, "learning_rate": 2.6240000000000003e-05, "loss": 0.0637, "step": 2350 }, { "epoch": 0.384, "grad_norm": 5574.75439453125, "learning_rate": 2.616e-05, "loss": 0.0698, "step": 2400 }, { "epoch": 0.392, "grad_norm": 6910.39599609375, "learning_rate": 2.608e-05, "loss": 0.0645, "step": 2450 }, { "epoch": 0.4, "grad_norm": 5913.9775390625, "learning_rate": 2.6000000000000002e-05, "loss": 0.068, "step": 2500 }, { "epoch": 0.4, "eval_loss": 0.08615937829017639, "eval_runtime": 116.9591, "eval_samples_per_second": 17.1, "eval_steps_per_second": 2.137, "step": 2500 }, { "epoch": 0.408, "grad_norm": 7447.5625, "learning_rate": 2.592e-05, "loss": 0.0672, "step": 2550 }, { "epoch": 0.416, "grad_norm": 7057.10009765625, "learning_rate": 2.584e-05, "loss": 0.0683, "step": 2600 }, { "epoch": 0.424, "grad_norm": 8279.7392578125, "learning_rate": 2.576e-05, "loss": 0.0631, "step": 2650 }, { "epoch": 0.432, "grad_norm": 7663.275390625, "learning_rate": 2.568e-05, "loss": 0.0698, "step": 2700 }, { "epoch": 0.44, "grad_norm": 7116.74609375, "learning_rate": 2.5600000000000002e-05, "loss": 0.0703, "step": 2750 }, { "epoch": 0.448, "grad_norm": 8839.5986328125, "learning_rate": 2.552e-05, "loss": 0.0654, "step": 2800 }, { "epoch": 0.456, "grad_norm": 7157.17333984375, "learning_rate": 2.544e-05, "loss": 0.0628, "step": 2850 }, { "epoch": 0.464, "grad_norm": 7690.267578125, "learning_rate": 2.536e-05, "loss": 0.0694, "step": 2900 }, { "epoch": 0.472, "grad_norm": 5030.39501953125, "learning_rate": 2.5280000000000002e-05, "loss": 0.0654, "step": 2950 }, { "epoch": 0.48, "grad_norm": 7269.51171875, "learning_rate": 2.52e-05, "loss": 0.0732, "step": 3000 }, { "epoch": 0.48, "eval_loss": 0.08551913499832153, "eval_runtime": 116.545, "eval_samples_per_second": 17.161, "eval_steps_per_second": 2.145, "step": 3000 }, { "epoch": 0.488, "grad_norm": 7060.21826171875, "learning_rate": 2.5120000000000003e-05, "loss": 0.0684, "step": 3050 }, { "epoch": 0.496, "grad_norm": 7841.55322265625, "learning_rate": 2.504e-05, "loss": 0.0653, "step": 3100 }, { "epoch": 0.504, "grad_norm": 5290.3271484375, "learning_rate": 2.4959999999999998e-05, "loss": 0.0668, "step": 3150 }, { "epoch": 0.512, "grad_norm": 6200.4853515625, "learning_rate": 2.4880000000000002e-05, "loss": 0.0665, "step": 3200 }, { "epoch": 0.52, "grad_norm": 6859.83544921875, "learning_rate": 2.48e-05, "loss": 0.0678, "step": 3250 }, { "epoch": 0.528, "grad_norm": 7718.70068359375, "learning_rate": 2.472e-05, "loss": 0.0679, "step": 3300 }, { "epoch": 0.536, "grad_norm": 10752.4873046875, "learning_rate": 2.464e-05, "loss": 0.062, "step": 3350 }, { "epoch": 0.544, "grad_norm": 6991.5087890625, "learning_rate": 2.456e-05, "loss": 0.0659, "step": 3400 }, { "epoch": 0.552, "grad_norm": 6204.99658203125, "learning_rate": 2.448e-05, "loss": 0.0636, "step": 3450 }, { "epoch": 0.56, "grad_norm": 13521.5908203125, "learning_rate": 2.44e-05, "loss": 0.0671, "step": 3500 }, { "epoch": 0.56, "eval_loss": 0.08540560305118561, "eval_runtime": 116.9131, "eval_samples_per_second": 17.107, "eval_steps_per_second": 2.138, "step": 3500 }, { "epoch": 0.568, "grad_norm": 6408.47265625, "learning_rate": 2.432e-05, "loss": 0.0652, "step": 3550 }, { "epoch": 0.576, "grad_norm": 5537.69287109375, "learning_rate": 2.4240000000000002e-05, "loss": 0.0633, "step": 3600 }, { "epoch": 0.584, "grad_norm": 7664.20703125, "learning_rate": 2.4160000000000002e-05, "loss": 0.0652, "step": 3650 }, { "epoch": 0.592, "grad_norm": 5726.9697265625, "learning_rate": 2.408e-05, "loss": 0.0667, "step": 3700 }, { "epoch": 0.6, "grad_norm": 6898.275390625, "learning_rate": 2.4e-05, "loss": 0.0675, "step": 3750 }, { "epoch": 0.608, "grad_norm": 9309.822265625, "learning_rate": 2.392e-05, "loss": 0.0668, "step": 3800 }, { "epoch": 0.616, "grad_norm": 8566.080078125, "learning_rate": 2.384e-05, "loss": 0.064, "step": 3850 }, { "epoch": 0.624, "grad_norm": 5729.54833984375, "learning_rate": 2.3760000000000003e-05, "loss": 0.0635, "step": 3900 }, { "epoch": 0.632, "grad_norm": 9562.8701171875, "learning_rate": 2.368e-05, "loss": 0.0643, "step": 3950 }, { "epoch": 0.64, "grad_norm": 4704.76025390625, "learning_rate": 2.3599999999999998e-05, "loss": 0.0649, "step": 4000 }, { "epoch": 0.64, "eval_loss": 0.08466340601444244, "eval_runtime": 116.6411, "eval_samples_per_second": 17.147, "eval_steps_per_second": 2.143, "step": 4000 }, { "epoch": 0.648, "grad_norm": 7243.01611328125, "learning_rate": 2.3520000000000002e-05, "loss": 0.0622, "step": 4050 }, { "epoch": 0.656, "grad_norm": 7986.32568359375, "learning_rate": 2.344e-05, "loss": 0.0678, "step": 4100 }, { "epoch": 0.664, "grad_norm": 9114.8974609375, "learning_rate": 2.336e-05, "loss": 0.0671, "step": 4150 }, { "epoch": 0.672, "grad_norm": 8830.62109375, "learning_rate": 2.328e-05, "loss": 0.0679, "step": 4200 }, { "epoch": 0.68, "grad_norm": 9311.2412109375, "learning_rate": 2.32e-05, "loss": 0.063, "step": 4250 }, { "epoch": 0.688, "grad_norm": 31307.103515625, "learning_rate": 2.3120000000000002e-05, "loss": 0.0649, "step": 4300 }, { "epoch": 0.696, "grad_norm": 9040.0126953125, "learning_rate": 2.304e-05, "loss": 0.0633, "step": 4350 }, { "epoch": 0.704, "grad_norm": 7183.91650390625, "learning_rate": 2.296e-05, "loss": 0.0582, "step": 4400 }, { "epoch": 0.712, "grad_norm": 6460.2998046875, "learning_rate": 2.288e-05, "loss": 0.0672, "step": 4450 }, { "epoch": 0.72, "grad_norm": 6104.8671875, "learning_rate": 2.2800000000000002e-05, "loss": 0.0597, "step": 4500 }, { "epoch": 0.72, "eval_loss": 0.0842796117067337, "eval_runtime": 116.9361, "eval_samples_per_second": 17.103, "eval_steps_per_second": 2.138, "step": 4500 }, { "epoch": 0.728, "grad_norm": 7553.5556640625, "learning_rate": 2.272e-05, "loss": 0.063, "step": 4550 }, { "epoch": 0.736, "grad_norm": 7194.16162109375, "learning_rate": 2.2640000000000003e-05, "loss": 0.0597, "step": 4600 }, { "epoch": 0.744, "grad_norm": 7578.23583984375, "learning_rate": 2.256e-05, "loss": 0.0627, "step": 4650 }, { "epoch": 0.752, "grad_norm": 7874.51904296875, "learning_rate": 2.2479999999999998e-05, "loss": 0.0628, "step": 4700 }, { "epoch": 0.76, "grad_norm": 6014.06640625, "learning_rate": 2.2400000000000002e-05, "loss": 0.0651, "step": 4750 }, { "epoch": 0.768, "grad_norm": 7170.10400390625, "learning_rate": 2.232e-05, "loss": 0.0656, "step": 4800 }, { "epoch": 0.776, "grad_norm": 7596.84326171875, "learning_rate": 2.224e-05, "loss": 0.0598, "step": 4850 }, { "epoch": 0.784, "grad_norm": 7802.14990234375, "learning_rate": 2.216e-05, "loss": 0.0605, "step": 4900 }, { "epoch": 0.792, "grad_norm": 5468.1845703125, "learning_rate": 2.208e-05, "loss": 0.0594, "step": 4950 }, { "epoch": 0.8, "grad_norm": 5185.58642578125, "learning_rate": 2.2e-05, "loss": 0.0586, "step": 5000 }, { "epoch": 0.8, "eval_loss": 0.08396206796169281, "eval_runtime": 116.8224, "eval_samples_per_second": 17.12, "eval_steps_per_second": 2.14, "step": 5000 }, { "epoch": 0.808, "grad_norm": 6047.43359375, "learning_rate": 2.192e-05, "loss": 0.0673, "step": 5050 }, { "epoch": 0.816, "grad_norm": 6286.21484375, "learning_rate": 2.184e-05, "loss": 0.0609, "step": 5100 }, { "epoch": 0.824, "grad_norm": 6187.03369140625, "learning_rate": 2.1760000000000002e-05, "loss": 0.0628, "step": 5150 }, { "epoch": 0.832, "grad_norm": 4476.73095703125, "learning_rate": 2.1680000000000002e-05, "loss": 0.0626, "step": 5200 }, { "epoch": 0.84, "grad_norm": 6180.27490234375, "learning_rate": 2.16e-05, "loss": 0.061, "step": 5250 }, { "epoch": 0.848, "grad_norm": 8477.626953125, "learning_rate": 2.152e-05, "loss": 0.0638, "step": 5300 }, { "epoch": 0.856, "grad_norm": 11541.119140625, "learning_rate": 2.144e-05, "loss": 0.0602, "step": 5350 }, { "epoch": 0.864, "grad_norm": 6183.49609375, "learning_rate": 2.136e-05, "loss": 0.0645, "step": 5400 }, { "epoch": 0.872, "grad_norm": 7597.5810546875, "learning_rate": 2.1280000000000003e-05, "loss": 0.067, "step": 5450 }, { "epoch": 0.88, "grad_norm": 8438.478515625, "learning_rate": 2.12e-05, "loss": 0.0628, "step": 5500 }, { "epoch": 0.88, "eval_loss": 0.08360794186592102, "eval_runtime": 116.6576, "eval_samples_per_second": 17.144, "eval_steps_per_second": 2.143, "step": 5500 }, { "epoch": 0.888, "grad_norm": 8200.35546875, "learning_rate": 2.1119999999999998e-05, "loss": 0.0676, "step": 5550 }, { "epoch": 0.896, "grad_norm": 8816.8076171875, "learning_rate": 2.1040000000000002e-05, "loss": 0.0626, "step": 5600 }, { "epoch": 0.904, "grad_norm": 8886.630859375, "learning_rate": 2.096e-05, "loss": 0.0657, "step": 5650 }, { "epoch": 0.912, "grad_norm": 8212.525390625, "learning_rate": 2.088e-05, "loss": 0.0619, "step": 5700 }, { "epoch": 0.92, "grad_norm": 5723.00439453125, "learning_rate": 2.08e-05, "loss": 0.0623, "step": 5750 }, { "epoch": 0.928, "grad_norm": 8616.3349609375, "learning_rate": 2.072e-05, "loss": 0.063, "step": 5800 }, { "epoch": 0.936, "grad_norm": 7717.373046875, "learning_rate": 2.064e-05, "loss": 0.063, "step": 5850 }, { "epoch": 0.944, "grad_norm": 6325.8193359375, "learning_rate": 2.056e-05, "loss": 0.0628, "step": 5900 }, { "epoch": 0.952, "grad_norm": 6938.89111328125, "learning_rate": 2.048e-05, "loss": 0.0585, "step": 5950 }, { "epoch": 0.96, "grad_norm": 8704.166015625, "learning_rate": 2.04e-05, "loss": 0.0634, "step": 6000 }, { "epoch": 0.96, "eval_loss": 0.08321517705917358, "eval_runtime": 116.6701, "eval_samples_per_second": 17.142, "eval_steps_per_second": 2.143, "step": 6000 }, { "epoch": 0.968, "grad_norm": 5835.19189453125, "learning_rate": 2.0320000000000002e-05, "loss": 0.0643, "step": 6050 }, { "epoch": 0.976, "grad_norm": 5896.76318359375, "learning_rate": 2.024e-05, "loss": 0.0625, "step": 6100 }, { "epoch": 0.984, "grad_norm": 6958.45751953125, "learning_rate": 2.016e-05, "loss": 0.0657, "step": 6150 }, { "epoch": 0.992, "grad_norm": 4680.04736328125, "learning_rate": 2.008e-05, "loss": 0.0632, "step": 6200 }, { "epoch": 1.0, "grad_norm": 8230.8056640625, "learning_rate": 1.9999999999999998e-05, "loss": 0.0603, "step": 6250 }, { "epoch": 1.008, "grad_norm": 5693.77001953125, "learning_rate": 1.9920000000000002e-05, "loss": 0.0574, "step": 6300 }, { "epoch": 1.016, "grad_norm": 14030.3583984375, "learning_rate": 1.984e-05, "loss": 0.0563, "step": 6350 }, { "epoch": 1.024, "grad_norm": 11693.09375, "learning_rate": 1.976e-05, "loss": 0.0558, "step": 6400 }, { "epoch": 1.032, "grad_norm": 5772.1845703125, "learning_rate": 1.968e-05, "loss": 0.0544, "step": 6450 }, { "epoch": 1.04, "grad_norm": 8641.919921875, "learning_rate": 1.96e-05, "loss": 0.0606, "step": 6500 }, { "epoch": 1.04, "eval_loss": 0.08356834203004837, "eval_runtime": 116.7914, "eval_samples_per_second": 17.125, "eval_steps_per_second": 2.141, "step": 6500 } ], "logging_steps": 50, "max_steps": 18750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.166581030912e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }