{ "best_metric": 0.624975860118866, "best_model_checkpoint": "./model_fine-tune/glot/mbert/ben-Beng/checkpoint-99000", "epoch": 29.75653742110009, "eval_steps": 500, "global_step": 99000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15028554253080853, "grad_norm": 2.96537446975708, "learning_rate": 9.95e-05, "loss": 1.281, "step": 500 }, { "epoch": 0.15028554253080853, "eval_accuracy": 0.7522906509778075, "eval_loss": 1.32131826877594, "eval_runtime": 116.8245, "eval_samples_per_second": 204.152, "eval_steps_per_second": 6.386, "step": 500 }, { "epoch": 0.30057108506161706, "grad_norm": 3.2341372966766357, "learning_rate": 9.900000000000001e-05, "loss": 1.1448, "step": 1000 }, { "epoch": 0.30057108506161706, "eval_accuracy": 0.7658332431476369, "eval_loss": 1.2582815885543823, "eval_runtime": 116.1473, "eval_samples_per_second": 205.343, "eval_steps_per_second": 6.423, "step": 1000 }, { "epoch": 0.4508566275924256, "grad_norm": 2.849609613418579, "learning_rate": 9.850000000000001e-05, "loss": 1.0776, "step": 1500 }, { "epoch": 0.4508566275924256, "eval_accuracy": 0.7766924857873619, "eval_loss": 1.2013344764709473, "eval_runtime": 116.4982, "eval_samples_per_second": 204.724, "eval_steps_per_second": 6.404, "step": 1500 }, { "epoch": 0.6011421701232341, "grad_norm": 4.652078628540039, "learning_rate": 9.8e-05, "loss": 1.0351, "step": 2000 }, { "epoch": 0.6011421701232341, "eval_accuracy": 0.7827826654791789, "eval_loss": 1.1614470481872559, "eval_runtime": 116.2018, "eval_samples_per_second": 205.246, "eval_steps_per_second": 6.42, "step": 2000 }, { "epoch": 0.7514277126540427, "grad_norm": 2.517792224884033, "learning_rate": 9.75e-05, "loss": 0.9949, "step": 2500 }, { "epoch": 0.7514277126540427, "eval_accuracy": 0.7896993660264928, "eval_loss": 1.1289317607879639, "eval_runtime": 116.1479, "eval_samples_per_second": 205.342, "eval_steps_per_second": 6.423, "step": 2500 }, { "epoch": 0.9017132551848512, "grad_norm": 2.952677011489868, "learning_rate": 9.7e-05, "loss": 0.9672, "step": 3000 }, { "epoch": 0.9017132551848512, "eval_accuracy": 0.795600216965857, "eval_loss": 1.1054171323776245, "eval_runtime": 116.2913, "eval_samples_per_second": 205.088, "eval_steps_per_second": 6.415, "step": 3000 }, { "epoch": 1.0519987977156597, "grad_norm": 2.5801005363464355, "learning_rate": 9.65e-05, "loss": 0.9423, "step": 3500 }, { "epoch": 1.0519987977156597, "eval_accuracy": 0.7991275637592986, "eval_loss": 1.0799400806427002, "eval_runtime": 116.2346, "eval_samples_per_second": 205.188, "eval_steps_per_second": 6.418, "step": 3500 }, { "epoch": 1.2022843402464682, "grad_norm": 2.616811752319336, "learning_rate": 9.6e-05, "loss": 0.9192, "step": 4000 }, { "epoch": 1.2022843402464682, "eval_accuracy": 0.8025899505976251, "eval_loss": 1.0550587177276611, "eval_runtime": 107.6657, "eval_samples_per_second": 221.519, "eval_steps_per_second": 6.929, "step": 4000 }, { "epoch": 1.3525698827772767, "grad_norm": 2.4628713130950928, "learning_rate": 9.55e-05, "loss": 0.9063, "step": 4500 }, { "epoch": 1.3525698827772767, "eval_accuracy": 0.8070988314079973, "eval_loss": 1.0351369380950928, "eval_runtime": 116.5727, "eval_samples_per_second": 204.593, "eval_steps_per_second": 6.399, "step": 4500 }, { "epoch": 1.5028554253080855, "grad_norm": 2.5644257068634033, "learning_rate": 9.5e-05, "loss": 0.8852, "step": 5000 }, { "epoch": 1.5028554253080855, "eval_accuracy": 0.8092755125435529, "eval_loss": 1.0269399881362915, "eval_runtime": 116.4211, "eval_samples_per_second": 204.86, "eval_steps_per_second": 6.408, "step": 5000 }, { "epoch": 1.653140967838894, "grad_norm": 2.1794581413269043, "learning_rate": 9.449999999999999e-05, "loss": 0.8635, "step": 5500 }, { "epoch": 1.653140967838894, "eval_accuracy": 0.8118362943598405, "eval_loss": 1.0109528303146362, "eval_runtime": 116.2782, "eval_samples_per_second": 205.112, "eval_steps_per_second": 6.416, "step": 5500 }, { "epoch": 1.8034265103697025, "grad_norm": 1.9800032377243042, "learning_rate": 9.4e-05, "loss": 0.8589, "step": 6000 }, { "epoch": 1.8034265103697025, "eval_accuracy": 0.8126376708652329, "eval_loss": 1.0049536228179932, "eval_runtime": 116.3839, "eval_samples_per_second": 204.925, "eval_steps_per_second": 6.41, "step": 6000 }, { "epoch": 1.953712052900511, "grad_norm": 2.2795419692993164, "learning_rate": 9.350000000000001e-05, "loss": 0.8537, "step": 6500 }, { "epoch": 1.953712052900511, "eval_accuracy": 0.8167079140922375, "eval_loss": 0.985532283782959, "eval_runtime": 116.522, "eval_samples_per_second": 204.682, "eval_steps_per_second": 6.402, "step": 6500 }, { "epoch": 2.1039975954313195, "grad_norm": 2.3482625484466553, "learning_rate": 9.300000000000001e-05, "loss": 0.8346, "step": 7000 }, { "epoch": 2.1039975954313195, "eval_accuracy": 0.8181904210369662, "eval_loss": 0.9796605706214905, "eval_runtime": 116.5003, "eval_samples_per_second": 204.72, "eval_steps_per_second": 6.403, "step": 7000 }, { "epoch": 2.254283137962128, "grad_norm": 2.2391133308410645, "learning_rate": 9.250000000000001e-05, "loss": 0.8172, "step": 7500 }, { "epoch": 2.254283137962128, "eval_accuracy": 0.8213880263923928, "eval_loss": 0.9652304649353027, "eval_runtime": 116.3474, "eval_samples_per_second": 204.99, "eval_steps_per_second": 6.412, "step": 7500 }, { "epoch": 2.4045686804929365, "grad_norm": 2.2324562072753906, "learning_rate": 9.200000000000001e-05, "loss": 0.816, "step": 8000 }, { "epoch": 2.4045686804929365, "eval_accuracy": 0.8213665133403569, "eval_loss": 0.9561855792999268, "eval_runtime": 116.564, "eval_samples_per_second": 204.609, "eval_steps_per_second": 6.4, "step": 8000 }, { "epoch": 2.554854223023745, "grad_norm": 2.2165215015411377, "learning_rate": 9.15e-05, "loss": 0.8091, "step": 8500 }, { "epoch": 2.554854223023745, "eval_accuracy": 0.8230892410962204, "eval_loss": 0.9617434740066528, "eval_runtime": 116.2672, "eval_samples_per_second": 205.131, "eval_steps_per_second": 6.416, "step": 8500 }, { "epoch": 2.7051397655545535, "grad_norm": 2.2473552227020264, "learning_rate": 9.1e-05, "loss": 0.7923, "step": 9000 }, { "epoch": 2.7051397655545535, "eval_accuracy": 0.8254686724678374, "eval_loss": 0.9391178488731384, "eval_runtime": 116.632, "eval_samples_per_second": 204.489, "eval_steps_per_second": 6.396, "step": 9000 }, { "epoch": 2.855425308085362, "grad_norm": 2.2657456398010254, "learning_rate": 9.05e-05, "loss": 0.789, "step": 9500 }, { "epoch": 2.855425308085362, "eval_accuracy": 0.8255632325904414, "eval_loss": 0.9453052878379822, "eval_runtime": 116.4993, "eval_samples_per_second": 204.722, "eval_steps_per_second": 6.403, "step": 9500 }, { "epoch": 3.005710850616171, "grad_norm": 2.2472214698791504, "learning_rate": 9e-05, "loss": 0.7881, "step": 10000 }, { "epoch": 3.005710850616171, "eval_accuracy": 0.8270709423550963, "eval_loss": 0.9381970763206482, "eval_runtime": 116.5491, "eval_samples_per_second": 204.635, "eval_steps_per_second": 6.401, "step": 10000 }, { "epoch": 3.1559963931469794, "grad_norm": 2.0700597763061523, "learning_rate": 8.950000000000001e-05, "loss": 0.7725, "step": 10500 }, { "epoch": 3.1559963931469794, "eval_accuracy": 0.8300675249784237, "eval_loss": 0.9213970899581909, "eval_runtime": 116.7191, "eval_samples_per_second": 204.337, "eval_steps_per_second": 6.391, "step": 10500 }, { "epoch": 3.306281935677788, "grad_norm": 2.218325614929199, "learning_rate": 8.900000000000001e-05, "loss": 0.7644, "step": 11000 }, { "epoch": 3.306281935677788, "eval_accuracy": 0.8305223187234902, "eval_loss": 0.9182527661323547, "eval_runtime": 116.6654, "eval_samples_per_second": 204.431, "eval_steps_per_second": 6.394, "step": 11000 }, { "epoch": 3.4565674782085964, "grad_norm": 1.9723238945007324, "learning_rate": 8.850000000000001e-05, "loss": 0.7626, "step": 11500 }, { "epoch": 3.4565674782085964, "eval_accuracy": 0.8318465493554398, "eval_loss": 0.9129367470741272, "eval_runtime": 116.4783, "eval_samples_per_second": 204.759, "eval_steps_per_second": 6.405, "step": 11500 }, { "epoch": 3.606853020739405, "grad_norm": 2.30340313911438, "learning_rate": 8.800000000000001e-05, "loss": 0.7593, "step": 12000 }, { "epoch": 3.606853020739405, "eval_accuracy": 0.8328743337102246, "eval_loss": 0.9093960523605347, "eval_runtime": 116.6694, "eval_samples_per_second": 204.424, "eval_steps_per_second": 6.394, "step": 12000 }, { "epoch": 3.7571385632702134, "grad_norm": 2.323202133178711, "learning_rate": 8.75e-05, "loss": 0.7513, "step": 12500 }, { "epoch": 3.7571385632702134, "eval_accuracy": 0.8333174281476498, "eval_loss": 0.9067990183830261, "eval_runtime": 116.5807, "eval_samples_per_second": 204.579, "eval_steps_per_second": 6.399, "step": 12500 }, { "epoch": 3.907424105801022, "grad_norm": 4.191928863525391, "learning_rate": 8.7e-05, "loss": 0.7473, "step": 13000 }, { "epoch": 3.907424105801022, "eval_accuracy": 0.8345748218067248, "eval_loss": 0.9029770493507385, "eval_runtime": 116.5201, "eval_samples_per_second": 204.686, "eval_steps_per_second": 6.402, "step": 13000 }, { "epoch": 4.057709648331831, "grad_norm": 1.9715632200241089, "learning_rate": 8.65e-05, "loss": 0.7389, "step": 13500 }, { "epoch": 4.057709648331831, "eval_accuracy": 0.8357844505105652, "eval_loss": 0.8932846784591675, "eval_runtime": 116.7024, "eval_samples_per_second": 204.366, "eval_steps_per_second": 6.392, "step": 13500 }, { "epoch": 4.207995190862639, "grad_norm": 2.2631866931915283, "learning_rate": 8.6e-05, "loss": 0.7293, "step": 14000 }, { "epoch": 4.207995190862639, "eval_accuracy": 0.836265716981726, "eval_loss": 0.8889855146408081, "eval_runtime": 116.4568, "eval_samples_per_second": 204.797, "eval_steps_per_second": 6.406, "step": 14000 }, { "epoch": 4.358280733393448, "grad_norm": 2.2398159503936768, "learning_rate": 8.55e-05, "loss": 0.7295, "step": 14500 }, { "epoch": 4.358280733393448, "eval_accuracy": 0.8369460487347083, "eval_loss": 0.8780855536460876, "eval_runtime": 116.6183, "eval_samples_per_second": 204.513, "eval_steps_per_second": 6.397, "step": 14500 }, { "epoch": 4.508566275924256, "grad_norm": 2.099126100540161, "learning_rate": 8.5e-05, "loss": 0.7231, "step": 15000 }, { "epoch": 4.508566275924256, "eval_accuracy": 0.8387421918446, "eval_loss": 0.8813783526420593, "eval_runtime": 116.5919, "eval_samples_per_second": 204.56, "eval_steps_per_second": 6.398, "step": 15000 }, { "epoch": 4.658851818455065, "grad_norm": 2.2657737731933594, "learning_rate": 8.450000000000001e-05, "loss": 0.7208, "step": 15500 }, { "epoch": 4.658851818455065, "eval_accuracy": 0.8391741484354931, "eval_loss": 0.8659800291061401, "eval_runtime": 109.8658, "eval_samples_per_second": 217.083, "eval_steps_per_second": 6.79, "step": 15500 }, { "epoch": 4.809137360985873, "grad_norm": 8.90707015991211, "learning_rate": 8.4e-05, "loss": 0.7136, "step": 16000 }, { "epoch": 4.809137360985873, "eval_accuracy": 0.8399824907406062, "eval_loss": 0.8702093958854675, "eval_runtime": 107.8562, "eval_samples_per_second": 221.128, "eval_steps_per_second": 6.917, "step": 16000 }, { "epoch": 4.959422903516682, "grad_norm": 2.3318724632263184, "learning_rate": 8.35e-05, "loss": 0.7132, "step": 16500 }, { "epoch": 4.959422903516682, "eval_accuracy": 0.841569782708991, "eval_loss": 0.864676833152771, "eval_runtime": 107.4423, "eval_samples_per_second": 221.98, "eval_steps_per_second": 6.943, "step": 16500 }, { "epoch": 5.10970844604749, "grad_norm": 2.184828519821167, "learning_rate": 8.3e-05, "loss": 0.701, "step": 17000 }, { "epoch": 5.10970844604749, "eval_accuracy": 0.8416186120501093, "eval_loss": 0.8688568472862244, "eval_runtime": 116.7276, "eval_samples_per_second": 204.322, "eval_steps_per_second": 6.391, "step": 17000 }, { "epoch": 5.259993988578299, "grad_norm": 2.3987364768981934, "learning_rate": 8.25e-05, "loss": 0.7019, "step": 17500 }, { "epoch": 5.259993988578299, "eval_accuracy": 0.8420038130428066, "eval_loss": 0.8639406561851501, "eval_runtime": 116.7446, "eval_samples_per_second": 204.292, "eval_steps_per_second": 6.39, "step": 17500 }, { "epoch": 5.410279531109107, "grad_norm": 2.1173081398010254, "learning_rate": 8.2e-05, "loss": 0.6947, "step": 18000 }, { "epoch": 5.410279531109107, "eval_accuracy": 0.8435476925488244, "eval_loss": 0.854152262210846, "eval_runtime": 116.6203, "eval_samples_per_second": 204.51, "eval_steps_per_second": 6.397, "step": 18000 }, { "epoch": 5.560565073639916, "grad_norm": 2.004016637802124, "learning_rate": 8.15e-05, "loss": 0.695, "step": 18500 }, { "epoch": 5.560565073639916, "eval_accuracy": 0.8427755032205632, "eval_loss": 0.8532910943031311, "eval_runtime": 116.782, "eval_samples_per_second": 204.227, "eval_steps_per_second": 6.388, "step": 18500 }, { "epoch": 5.710850616170724, "grad_norm": 1.8693519830703735, "learning_rate": 8.1e-05, "loss": 0.6934, "step": 19000 }, { "epoch": 5.710850616170724, "eval_accuracy": 0.8446345300638607, "eval_loss": 0.8402228355407715, "eval_runtime": 116.5059, "eval_samples_per_second": 204.711, "eval_steps_per_second": 6.403, "step": 19000 }, { "epoch": 5.861136158701533, "grad_norm": 1.8139874935150146, "learning_rate": 8.05e-05, "loss": 0.6931, "step": 19500 }, { "epoch": 5.861136158701533, "eval_accuracy": 0.8450877334842727, "eval_loss": 0.8398534059524536, "eval_runtime": 116.5795, "eval_samples_per_second": 204.581, "eval_steps_per_second": 6.399, "step": 19500 }, { "epoch": 6.011421701232342, "grad_norm": 1.8472687005996704, "learning_rate": 8e-05, "loss": 0.6807, "step": 20000 }, { "epoch": 6.011421701232342, "eval_accuracy": 0.8455407828689585, "eval_loss": 0.8407430052757263, "eval_runtime": 116.7431, "eval_samples_per_second": 204.295, "eval_steps_per_second": 6.39, "step": 20000 }, { "epoch": 6.16170724376315, "grad_norm": 1.9816970825195312, "learning_rate": 7.950000000000001e-05, "loss": 0.6754, "step": 20500 }, { "epoch": 6.16170724376315, "eval_accuracy": 0.8462444588355045, "eval_loss": 0.8319252729415894, "eval_runtime": 107.7996, "eval_samples_per_second": 221.244, "eval_steps_per_second": 6.92, "step": 20500 }, { "epoch": 6.311992786293959, "grad_norm": 1.9578967094421387, "learning_rate": 7.900000000000001e-05, "loss": 0.6711, "step": 21000 }, { "epoch": 6.311992786293959, "eval_accuracy": 0.8468846608391637, "eval_loss": 0.8344280123710632, "eval_runtime": 116.54, "eval_samples_per_second": 204.651, "eval_steps_per_second": 6.401, "step": 21000 }, { "epoch": 6.462278328824767, "grad_norm": 1.8662471771240234, "learning_rate": 7.850000000000001e-05, "loss": 0.6703, "step": 21500 }, { "epoch": 6.462278328824767, "eval_accuracy": 0.8473787771402311, "eval_loss": 0.8395190238952637, "eval_runtime": 116.5278, "eval_samples_per_second": 204.672, "eval_steps_per_second": 6.402, "step": 21500 }, { "epoch": 6.612563871355576, "grad_norm": 1.848175048828125, "learning_rate": 7.800000000000001e-05, "loss": 0.6726, "step": 22000 }, { "epoch": 6.612563871355576, "eval_accuracy": 0.8479999094414559, "eval_loss": 0.830569863319397, "eval_runtime": 116.3409, "eval_samples_per_second": 205.001, "eval_steps_per_second": 6.412, "step": 22000 }, { "epoch": 6.762849413886384, "grad_norm": 3.2590267658233643, "learning_rate": 7.75e-05, "loss": 0.6673, "step": 22500 }, { "epoch": 6.762849413886384, "eval_accuracy": 0.8487053283097933, "eval_loss": 0.8218342065811157, "eval_runtime": 116.5286, "eval_samples_per_second": 204.671, "eval_steps_per_second": 6.402, "step": 22500 }, { "epoch": 6.913134956417193, "grad_norm": 2.029209613800049, "learning_rate": 7.7e-05, "loss": 0.6677, "step": 23000 }, { "epoch": 6.913134956417193, "eval_accuracy": 0.8491438026892206, "eval_loss": 0.8219966888427734, "eval_runtime": 116.719, "eval_samples_per_second": 204.337, "eval_steps_per_second": 6.391, "step": 23000 }, { "epoch": 7.063420498948001, "grad_norm": 1.9509702920913696, "learning_rate": 7.65e-05, "loss": 0.6599, "step": 23500 }, { "epoch": 7.063420498948001, "eval_accuracy": 0.8494577121885412, "eval_loss": 0.8129581212997437, "eval_runtime": 116.5906, "eval_samples_per_second": 204.562, "eval_steps_per_second": 6.398, "step": 23500 }, { "epoch": 7.21370604147881, "grad_norm": 1.8702620267868042, "learning_rate": 7.6e-05, "loss": 0.6532, "step": 24000 }, { "epoch": 7.21370604147881, "eval_accuracy": 0.8504569565947769, "eval_loss": 0.8200488686561584, "eval_runtime": 116.5808, "eval_samples_per_second": 204.579, "eval_steps_per_second": 6.399, "step": 24000 }, { "epoch": 7.363991584009618, "grad_norm": 2.0318052768707275, "learning_rate": 7.55e-05, "loss": 0.6575, "step": 24500 }, { "epoch": 7.363991584009618, "eval_accuracy": 0.8519663987722629, "eval_loss": 0.8060214519500732, "eval_runtime": 116.8049, "eval_samples_per_second": 204.187, "eval_steps_per_second": 6.387, "step": 24500 }, { "epoch": 7.514277126540427, "grad_norm": 1.6989022493362427, "learning_rate": 7.500000000000001e-05, "loss": 0.6468, "step": 25000 }, { "epoch": 7.514277126540427, "eval_accuracy": 0.8512699830765766, "eval_loss": 0.8081585764884949, "eval_runtime": 116.542, "eval_samples_per_second": 204.647, "eval_steps_per_second": 6.401, "step": 25000 }, { "epoch": 7.664562669071236, "grad_norm": 2.257707118988037, "learning_rate": 7.450000000000001e-05, "loss": 0.6439, "step": 25500 }, { "epoch": 7.664562669071236, "eval_accuracy": 0.8514294666694662, "eval_loss": 0.8061248660087585, "eval_runtime": 116.5436, "eval_samples_per_second": 204.644, "eval_steps_per_second": 6.401, "step": 25500 }, { "epoch": 7.814848211602044, "grad_norm": 1.899179458618164, "learning_rate": 7.4e-05, "loss": 0.6495, "step": 26000 }, { "epoch": 7.814848211602044, "eval_accuracy": 0.8526779437503333, "eval_loss": 0.8057689070701599, "eval_runtime": 116.3147, "eval_samples_per_second": 205.047, "eval_steps_per_second": 6.414, "step": 26000 }, { "epoch": 7.965133754132852, "grad_norm": 1.9314388036727905, "learning_rate": 7.35e-05, "loss": 0.6435, "step": 26500 }, { "epoch": 7.965133754132852, "eval_accuracy": 0.853362589989667, "eval_loss": 0.7940758466720581, "eval_runtime": 114.6596, "eval_samples_per_second": 208.007, "eval_steps_per_second": 6.506, "step": 26500 }, { "epoch": 8.115419296663662, "grad_norm": 2.094813585281372, "learning_rate": 7.3e-05, "loss": 0.6424, "step": 27000 }, { "epoch": 8.115419296663662, "eval_accuracy": 0.8536983778194274, "eval_loss": 0.8023838996887207, "eval_runtime": 116.5889, "eval_samples_per_second": 204.565, "eval_steps_per_second": 6.399, "step": 27000 }, { "epoch": 8.26570483919447, "grad_norm": 1.9136908054351807, "learning_rate": 7.25e-05, "loss": 0.6323, "step": 27500 }, { "epoch": 8.26570483919447, "eval_accuracy": 0.8540688828661784, "eval_loss": 0.7902230620384216, "eval_runtime": 116.3078, "eval_samples_per_second": 205.059, "eval_steps_per_second": 6.414, "step": 27500 }, { "epoch": 8.415990381725278, "grad_norm": 1.9303456544876099, "learning_rate": 7.2e-05, "loss": 0.6286, "step": 28000 }, { "epoch": 8.415990381725278, "eval_accuracy": 0.854171778943018, "eval_loss": 0.7997020483016968, "eval_runtime": 116.152, "eval_samples_per_second": 205.334, "eval_steps_per_second": 6.423, "step": 28000 }, { "epoch": 8.566275924256086, "grad_norm": 2.050676107406616, "learning_rate": 7.15e-05, "loss": 0.6312, "step": 28500 }, { "epoch": 8.566275924256086, "eval_accuracy": 0.8552772785456614, "eval_loss": 0.7872821688652039, "eval_runtime": 116.3758, "eval_samples_per_second": 204.94, "eval_steps_per_second": 6.41, "step": 28500 }, { "epoch": 8.716561466786896, "grad_norm": 2.050739049911499, "learning_rate": 7.1e-05, "loss": 0.6324, "step": 29000 }, { "epoch": 8.716561466786896, "eval_accuracy": 0.8549230288380387, "eval_loss": 0.7922600507736206, "eval_runtime": 109.588, "eval_samples_per_second": 217.633, "eval_steps_per_second": 6.807, "step": 29000 }, { "epoch": 8.866847009317704, "grad_norm": 1.623498558998108, "learning_rate": 7.05e-05, "loss": 0.6286, "step": 29500 }, { "epoch": 8.866847009317704, "eval_accuracy": 0.8564964285252138, "eval_loss": 0.7877017855644226, "eval_runtime": 116.4946, "eval_samples_per_second": 204.731, "eval_steps_per_second": 6.404, "step": 29500 }, { "epoch": 9.017132551848512, "grad_norm": 1.860737919807434, "learning_rate": 7e-05, "loss": 0.6291, "step": 30000 }, { "epoch": 9.017132551848512, "eval_accuracy": 0.8560715199440857, "eval_loss": 0.7842057347297668, "eval_runtime": 116.5722, "eval_samples_per_second": 204.594, "eval_steps_per_second": 6.399, "step": 30000 }, { "epoch": 9.16741809437932, "grad_norm": 2.1786551475524902, "learning_rate": 6.95e-05, "loss": 0.6148, "step": 30500 }, { "epoch": 9.16741809437932, "eval_accuracy": 0.8570739288667117, "eval_loss": 0.7951995730400085, "eval_runtime": 116.2116, "eval_samples_per_second": 205.229, "eval_steps_per_second": 6.419, "step": 30500 }, { "epoch": 9.31770363691013, "grad_norm": 1.7373932600021362, "learning_rate": 6.9e-05, "loss": 0.6217, "step": 31000 }, { "epoch": 9.31770363691013, "eval_accuracy": 0.8569885099114952, "eval_loss": 0.7805807590484619, "eval_runtime": 110.1302, "eval_samples_per_second": 216.562, "eval_steps_per_second": 6.774, "step": 31000 }, { "epoch": 9.467989179440938, "grad_norm": 1.7054003477096558, "learning_rate": 6.850000000000001e-05, "loss": 0.6175, "step": 31500 }, { "epoch": 9.467989179440938, "eval_accuracy": 0.8572684213885305, "eval_loss": 0.7795332074165344, "eval_runtime": 116.4206, "eval_samples_per_second": 204.861, "eval_steps_per_second": 6.408, "step": 31500 }, { "epoch": 9.618274721971746, "grad_norm": 1.7366029024124146, "learning_rate": 6.800000000000001e-05, "loss": 0.6128, "step": 32000 }, { "epoch": 9.618274721971746, "eval_accuracy": 0.8570074656746878, "eval_loss": 0.7861946225166321, "eval_runtime": 116.2566, "eval_samples_per_second": 205.15, "eval_steps_per_second": 6.417, "step": 32000 }, { "epoch": 9.768560264502554, "grad_norm": 1.7678993940353394, "learning_rate": 6.750000000000001e-05, "loss": 0.6151, "step": 32500 }, { "epoch": 9.768560264502554, "eval_accuracy": 0.8578313175257881, "eval_loss": 0.778949499130249, "eval_runtime": 116.2944, "eval_samples_per_second": 205.083, "eval_steps_per_second": 6.415, "step": 32500 }, { "epoch": 9.918845807033364, "grad_norm": 4.949440002441406, "learning_rate": 6.7e-05, "loss": 0.6191, "step": 33000 }, { "epoch": 9.918845807033364, "eval_accuracy": 0.8593022861653341, "eval_loss": 0.7685178518295288, "eval_runtime": 107.5299, "eval_samples_per_second": 221.799, "eval_steps_per_second": 6.938, "step": 33000 }, { "epoch": 10.069131349564172, "grad_norm": 1.8426265716552734, "learning_rate": 6.65e-05, "loss": 0.6058, "step": 33500 }, { "epoch": 10.069131349564172, "eval_accuracy": 0.8591416363855463, "eval_loss": 0.765252411365509, "eval_runtime": 107.5583, "eval_samples_per_second": 221.74, "eval_steps_per_second": 6.936, "step": 33500 }, { "epoch": 10.21941689209498, "grad_norm": 1.7789777517318726, "learning_rate": 6.6e-05, "loss": 0.6005, "step": 34000 }, { "epoch": 10.21941689209498, "eval_accuracy": 0.8592280951092771, "eval_loss": 0.7692108750343323, "eval_runtime": 116.5899, "eval_samples_per_second": 204.563, "eval_steps_per_second": 6.398, "step": 34000 }, { "epoch": 10.36970243462579, "grad_norm": 2.1639723777770996, "learning_rate": 6.55e-05, "loss": 0.6001, "step": 34500 }, { "epoch": 10.36970243462579, "eval_accuracy": 0.8602981608347727, "eval_loss": 0.7665141820907593, "eval_runtime": 116.3241, "eval_samples_per_second": 205.031, "eval_steps_per_second": 6.413, "step": 34500 }, { "epoch": 10.519987977156598, "grad_norm": 1.8134987354278564, "learning_rate": 6.500000000000001e-05, "loss": 0.5973, "step": 35000 }, { "epoch": 10.519987977156598, "eval_accuracy": 0.8608364562936837, "eval_loss": 0.7619218230247498, "eval_runtime": 116.2895, "eval_samples_per_second": 205.092, "eval_steps_per_second": 6.415, "step": 35000 }, { "epoch": 10.670273519687406, "grad_norm": 1.8694010972976685, "learning_rate": 6.450000000000001e-05, "loss": 0.6013, "step": 35500 }, { "epoch": 10.670273519687406, "eval_accuracy": 0.8605051108563309, "eval_loss": 0.7721201777458191, "eval_runtime": 116.3056, "eval_samples_per_second": 205.063, "eval_steps_per_second": 6.414, "step": 35500 }, { "epoch": 10.820559062218214, "grad_norm": 2.026700258255005, "learning_rate": 6.400000000000001e-05, "loss": 0.5987, "step": 36000 }, { "epoch": 10.820559062218214, "eval_accuracy": 0.8605780552644463, "eval_loss": 0.764506995677948, "eval_runtime": 107.3828, "eval_samples_per_second": 222.103, "eval_steps_per_second": 6.947, "step": 36000 }, { "epoch": 10.970844604749024, "grad_norm": 1.8263424634933472, "learning_rate": 6.35e-05, "loss": 0.5997, "step": 36500 }, { "epoch": 10.970844604749024, "eval_accuracy": 0.8617338751036818, "eval_loss": 0.7574514150619507, "eval_runtime": 116.4828, "eval_samples_per_second": 204.751, "eval_steps_per_second": 6.404, "step": 36500 }, { "epoch": 11.121130147279832, "grad_norm": 1.9417959451675415, "learning_rate": 6.3e-05, "loss": 0.596, "step": 37000 }, { "epoch": 11.121130147279832, "eval_accuracy": 0.862104334907601, "eval_loss": 0.7522989511489868, "eval_runtime": 110.8852, "eval_samples_per_second": 215.087, "eval_steps_per_second": 6.728, "step": 37000 }, { "epoch": 11.27141568981064, "grad_norm": 1.9351465702056885, "learning_rate": 6.25e-05, "loss": 0.5892, "step": 37500 }, { "epoch": 11.27141568981064, "eval_accuracy": 0.8626984941557923, "eval_loss": 0.7553607225418091, "eval_runtime": 116.8061, "eval_samples_per_second": 204.185, "eval_steps_per_second": 6.387, "step": 37500 }, { "epoch": 11.421701232341448, "grad_norm": 1.6293436288833618, "learning_rate": 6.2e-05, "loss": 0.5891, "step": 38000 }, { "epoch": 11.421701232341448, "eval_accuracy": 0.8628248618659347, "eval_loss": 0.7480278015136719, "eval_runtime": 116.3453, "eval_samples_per_second": 204.993, "eval_steps_per_second": 6.412, "step": 38000 }, { "epoch": 11.571986774872258, "grad_norm": 1.9017398357391357, "learning_rate": 6.15e-05, "loss": 0.5859, "step": 38500 }, { "epoch": 11.571986774872258, "eval_accuracy": 0.8624547125611713, "eval_loss": 0.7559540271759033, "eval_runtime": 116.8597, "eval_samples_per_second": 204.091, "eval_steps_per_second": 6.384, "step": 38500 }, { "epoch": 11.722272317403066, "grad_norm": 1.8014448881149292, "learning_rate": 6.1e-05, "loss": 0.585, "step": 39000 }, { "epoch": 11.722272317403066, "eval_accuracy": 0.8632527743188181, "eval_loss": 0.7535591721534729, "eval_runtime": 116.7403, "eval_samples_per_second": 204.3, "eval_steps_per_second": 6.39, "step": 39000 }, { "epoch": 11.872557859933874, "grad_norm": 1.713276743888855, "learning_rate": 6.05e-05, "loss": 0.5866, "step": 39500 }, { "epoch": 11.872557859933874, "eval_accuracy": 0.863832667737443, "eval_loss": 0.7468861937522888, "eval_runtime": 116.7273, "eval_samples_per_second": 204.322, "eval_steps_per_second": 6.391, "step": 39500 }, { "epoch": 12.022843402464684, "grad_norm": 1.8740110397338867, "learning_rate": 6e-05, "loss": 0.5788, "step": 40000 }, { "epoch": 12.022843402464684, "eval_accuracy": 0.8632054460239377, "eval_loss": 0.7444382309913635, "eval_runtime": 116.6273, "eval_samples_per_second": 204.498, "eval_steps_per_second": 6.396, "step": 40000 }, { "epoch": 12.173128944995492, "grad_norm": 1.8501290082931519, "learning_rate": 5.95e-05, "loss": 0.5783, "step": 40500 }, { "epoch": 12.173128944995492, "eval_accuracy": 0.8640383986565314, "eval_loss": 0.7565832138061523, "eval_runtime": 116.772, "eval_samples_per_second": 204.244, "eval_steps_per_second": 6.389, "step": 40500 }, { "epoch": 12.3234144875263, "grad_norm": 1.7049719095230103, "learning_rate": 5.9e-05, "loss": 0.5742, "step": 41000 }, { "epoch": 12.3234144875263, "eval_accuracy": 0.8644855392432329, "eval_loss": 0.7459850907325745, "eval_runtime": 116.6622, "eval_samples_per_second": 204.436, "eval_steps_per_second": 6.395, "step": 41000 }, { "epoch": 12.473700030057108, "grad_norm": 2.143068552017212, "learning_rate": 5.85e-05, "loss": 0.5745, "step": 41500 }, { "epoch": 12.473700030057108, "eval_accuracy": 0.8649068423261972, "eval_loss": 0.7483195066452026, "eval_runtime": 116.6586, "eval_samples_per_second": 204.443, "eval_steps_per_second": 6.395, "step": 41500 }, { "epoch": 12.623985572587918, "grad_norm": 1.9698896408081055, "learning_rate": 5.8e-05, "loss": 0.5761, "step": 42000 }, { "epoch": 12.623985572587918, "eval_accuracy": 0.8654425852392968, "eval_loss": 0.748586118221283, "eval_runtime": 116.7681, "eval_samples_per_second": 204.251, "eval_steps_per_second": 6.389, "step": 42000 }, { "epoch": 12.774271115118726, "grad_norm": 1.8358840942382812, "learning_rate": 5.7499999999999995e-05, "loss": 0.5724, "step": 42500 }, { "epoch": 12.774271115118726, "eval_accuracy": 0.8659325247593141, "eval_loss": 0.7420374155044556, "eval_runtime": 116.7322, "eval_samples_per_second": 204.314, "eval_steps_per_second": 6.391, "step": 42500 }, { "epoch": 12.924556657649534, "grad_norm": 2.802210807800293, "learning_rate": 5.6999999999999996e-05, "loss": 0.5735, "step": 43000 }, { "epoch": 12.924556657649534, "eval_accuracy": 0.8659312905140033, "eval_loss": 0.7376499176025391, "eval_runtime": 116.9285, "eval_samples_per_second": 203.971, "eval_steps_per_second": 6.38, "step": 43000 }, { "epoch": 13.074842200180342, "grad_norm": 1.8747409582138062, "learning_rate": 5.65e-05, "loss": 0.5641, "step": 43500 }, { "epoch": 13.074842200180342, "eval_accuracy": 0.8668957019102801, "eval_loss": 0.7305882573127747, "eval_runtime": 107.5831, "eval_samples_per_second": 221.689, "eval_steps_per_second": 6.934, "step": 43500 }, { "epoch": 13.225127742711152, "grad_norm": 1.808056354522705, "learning_rate": 5.6000000000000006e-05, "loss": 0.5624, "step": 44000 }, { "epoch": 13.225127742711152, "eval_accuracy": 0.8658279568155942, "eval_loss": 0.739223837852478, "eval_runtime": 108.044, "eval_samples_per_second": 220.743, "eval_steps_per_second": 6.905, "step": 44000 }, { "epoch": 13.37541328524196, "grad_norm": 1.8610390424728394, "learning_rate": 5.550000000000001e-05, "loss": 0.567, "step": 44500 }, { "epoch": 13.37541328524196, "eval_accuracy": 0.8667378631201024, "eval_loss": 0.7321043014526367, "eval_runtime": 109.9862, "eval_samples_per_second": 216.845, "eval_steps_per_second": 6.783, "step": 44500 }, { "epoch": 13.525698827772768, "grad_norm": 2.11275053024292, "learning_rate": 5.500000000000001e-05, "loss": 0.5605, "step": 45000 }, { "epoch": 13.525698827772768, "eval_accuracy": 0.8663817862037386, "eval_loss": 0.7416894435882568, "eval_runtime": 116.6588, "eval_samples_per_second": 204.442, "eval_steps_per_second": 6.395, "step": 45000 }, { "epoch": 13.675984370303578, "grad_norm": 1.9117276668548584, "learning_rate": 5.45e-05, "loss": 0.5631, "step": 45500 }, { "epoch": 13.675984370303578, "eval_accuracy": 0.8677395573035557, "eval_loss": 0.741428017616272, "eval_runtime": 116.6652, "eval_samples_per_second": 204.431, "eval_steps_per_second": 6.394, "step": 45500 }, { "epoch": 13.826269912834386, "grad_norm": 1.6430315971374512, "learning_rate": 5.4000000000000005e-05, "loss": 0.5641, "step": 46000 }, { "epoch": 13.826269912834386, "eval_accuracy": 0.8683863736931012, "eval_loss": 0.7305155396461487, "eval_runtime": 116.5721, "eval_samples_per_second": 204.594, "eval_steps_per_second": 6.399, "step": 46000 }, { "epoch": 13.976555455365194, "grad_norm": 2.08204984664917, "learning_rate": 5.3500000000000006e-05, "loss": 0.5608, "step": 46500 }, { "epoch": 13.976555455365194, "eval_accuracy": 0.8681003584229391, "eval_loss": 0.7270543575286865, "eval_runtime": 116.8818, "eval_samples_per_second": 204.052, "eval_steps_per_second": 6.383, "step": 46500 }, { "epoch": 14.126840997896002, "grad_norm": 1.8439579010009766, "learning_rate": 5.300000000000001e-05, "loss": 0.5527, "step": 47000 }, { "epoch": 14.126840997896002, "eval_accuracy": 0.8685334955538806, "eval_loss": 0.7283613681793213, "eval_runtime": 116.9737, "eval_samples_per_second": 203.892, "eval_steps_per_second": 6.378, "step": 47000 }, { "epoch": 14.277126540426812, "grad_norm": 1.885826826095581, "learning_rate": 5.25e-05, "loss": 0.5545, "step": 47500 }, { "epoch": 14.277126540426812, "eval_accuracy": 0.868743758995724, "eval_loss": 0.714011549949646, "eval_runtime": 117.0856, "eval_samples_per_second": 203.697, "eval_steps_per_second": 6.371, "step": 47500 }, { "epoch": 14.42741208295762, "grad_norm": 2.082005023956299, "learning_rate": 5.2000000000000004e-05, "loss": 0.5531, "step": 48000 }, { "epoch": 14.42741208295762, "eval_accuracy": 0.8684484035004348, "eval_loss": 0.7312475442886353, "eval_runtime": 117.0945, "eval_samples_per_second": 203.682, "eval_steps_per_second": 6.371, "step": 48000 }, { "epoch": 14.577697625488428, "grad_norm": 1.7705485820770264, "learning_rate": 5.1500000000000005e-05, "loss": 0.5472, "step": 48500 }, { "epoch": 14.577697625488428, "eval_accuracy": 0.8695177359118011, "eval_loss": 0.7208582162857056, "eval_runtime": 116.7861, "eval_samples_per_second": 204.219, "eval_steps_per_second": 6.388, "step": 48500 }, { "epoch": 14.727983168019236, "grad_norm": 1.748723030090332, "learning_rate": 5.1000000000000006e-05, "loss": 0.5478, "step": 49000 }, { "epoch": 14.727983168019236, "eval_accuracy": 0.8689573785332655, "eval_loss": 0.7217094302177429, "eval_runtime": 107.6929, "eval_samples_per_second": 221.463, "eval_steps_per_second": 6.927, "step": 49000 }, { "epoch": 14.878268710550046, "grad_norm": 2.0606579780578613, "learning_rate": 5.05e-05, "loss": 0.5478, "step": 49500 }, { "epoch": 14.878268710550046, "eval_accuracy": 0.8699593453930161, "eval_loss": 0.7098901271820068, "eval_runtime": 116.9388, "eval_samples_per_second": 203.953, "eval_steps_per_second": 6.379, "step": 49500 }, { "epoch": 15.028554253080854, "grad_norm": 1.8069385290145874, "learning_rate": 5e-05, "loss": 0.5509, "step": 50000 }, { "epoch": 15.028554253080854, "eval_accuracy": 0.8700326307006707, "eval_loss": 0.7232961654663086, "eval_runtime": 116.7858, "eval_samples_per_second": 204.22, "eval_steps_per_second": 6.388, "step": 50000 }, { "epoch": 15.178839795611662, "grad_norm": 1.7858914136886597, "learning_rate": 4.9500000000000004e-05, "loss": 0.538, "step": 50500 }, { "epoch": 15.178839795611662, "eval_accuracy": 0.8703979339691097, "eval_loss": 0.7135134935379028, "eval_runtime": 116.6054, "eval_samples_per_second": 204.536, "eval_steps_per_second": 6.398, "step": 50500 }, { "epoch": 15.32912533814247, "grad_norm": 1.7109432220458984, "learning_rate": 4.9e-05, "loss": 0.5446, "step": 51000 }, { "epoch": 15.32912533814247, "eval_accuracy": 0.8702478491677152, "eval_loss": 0.7196989059448242, "eval_runtime": 107.9173, "eval_samples_per_second": 221.003, "eval_steps_per_second": 6.913, "step": 51000 }, { "epoch": 15.47941088067328, "grad_norm": 1.9177416563034058, "learning_rate": 4.85e-05, "loss": 0.539, "step": 51500 }, { "epoch": 15.47941088067328, "eval_accuracy": 0.8703270612937091, "eval_loss": 0.7177140712738037, "eval_runtime": 116.8402, "eval_samples_per_second": 204.125, "eval_steps_per_second": 6.385, "step": 51500 }, { "epoch": 15.629696423204088, "grad_norm": 1.9683233499526978, "learning_rate": 4.8e-05, "loss": 0.5417, "step": 52000 }, { "epoch": 15.629696423204088, "eval_accuracy": 0.8706941705195771, "eval_loss": 0.7134575843811035, "eval_runtime": 116.8038, "eval_samples_per_second": 204.189, "eval_steps_per_second": 6.387, "step": 52000 }, { "epoch": 15.779981965734896, "grad_norm": 2.3991804122924805, "learning_rate": 4.75e-05, "loss": 0.5399, "step": 52500 }, { "epoch": 15.779981965734896, "eval_accuracy": 0.8716151455126488, "eval_loss": 0.713813066482544, "eval_runtime": 116.7969, "eval_samples_per_second": 204.201, "eval_steps_per_second": 6.387, "step": 52500 }, { "epoch": 15.930267508265704, "grad_norm": 1.9599590301513672, "learning_rate": 4.7e-05, "loss": 0.5398, "step": 53000 }, { "epoch": 15.930267508265704, "eval_accuracy": 0.871747411710409, "eval_loss": 0.7075196504592896, "eval_runtime": 116.7892, "eval_samples_per_second": 204.214, "eval_steps_per_second": 6.388, "step": 53000 }, { "epoch": 16.080553050796514, "grad_norm": 1.9201576709747314, "learning_rate": 4.6500000000000005e-05, "loss": 0.5408, "step": 53500 }, { "epoch": 16.080553050796514, "eval_accuracy": 0.8715354416875233, "eval_loss": 0.7076723575592041, "eval_runtime": 116.6515, "eval_samples_per_second": 204.455, "eval_steps_per_second": 6.395, "step": 53500 }, { "epoch": 16.230838593327324, "grad_norm": 1.6085834503173828, "learning_rate": 4.600000000000001e-05, "loss": 0.536, "step": 54000 }, { "epoch": 16.230838593327324, "eval_accuracy": 0.8717389564175397, "eval_loss": 0.7048630714416504, "eval_runtime": 116.4799, "eval_samples_per_second": 204.756, "eval_steps_per_second": 6.405, "step": 54000 }, { "epoch": 16.38112413585813, "grad_norm": 1.6757999658584595, "learning_rate": 4.55e-05, "loss": 0.5315, "step": 54500 }, { "epoch": 16.38112413585813, "eval_accuracy": 0.8721356793847945, "eval_loss": 0.7077359557151794, "eval_runtime": 116.7841, "eval_samples_per_second": 204.223, "eval_steps_per_second": 6.388, "step": 54500 }, { "epoch": 16.53140967838894, "grad_norm": 1.672525405883789, "learning_rate": 4.5e-05, "loss": 0.53, "step": 55000 }, { "epoch": 16.53140967838894, "eval_accuracy": 0.8728707770815839, "eval_loss": 0.6990543007850647, "eval_runtime": 108.2343, "eval_samples_per_second": 220.355, "eval_steps_per_second": 6.892, "step": 55000 }, { "epoch": 16.681695220919746, "grad_norm": 1.9406942129135132, "learning_rate": 4.4500000000000004e-05, "loss": 0.5306, "step": 55500 }, { "epoch": 16.681695220919746, "eval_accuracy": 0.8732208974146737, "eval_loss": 0.6984574794769287, "eval_runtime": 107.8142, "eval_samples_per_second": 221.214, "eval_steps_per_second": 6.919, "step": 55500 }, { "epoch": 16.831980763450556, "grad_norm": 1.8891712427139282, "learning_rate": 4.4000000000000006e-05, "loss": 0.525, "step": 56000 }, { "epoch": 16.831980763450556, "eval_accuracy": 0.8726220670882622, "eval_loss": 0.7078565359115601, "eval_runtime": 116.9593, "eval_samples_per_second": 203.917, "eval_steps_per_second": 6.378, "step": 56000 }, { "epoch": 16.982266305981366, "grad_norm": 1.714554786682129, "learning_rate": 4.35e-05, "loss": 0.5299, "step": 56500 }, { "epoch": 16.982266305981366, "eval_accuracy": 0.8730728804514805, "eval_loss": 0.6982918381690979, "eval_runtime": 116.7035, "eval_samples_per_second": 204.364, "eval_steps_per_second": 6.392, "step": 56500 }, { "epoch": 17.132551848512172, "grad_norm": 1.8683373928070068, "learning_rate": 4.3e-05, "loss": 0.5223, "step": 57000 }, { "epoch": 17.132551848512172, "eval_accuracy": 0.872708599649957, "eval_loss": 0.7026687860488892, "eval_runtime": 116.6959, "eval_samples_per_second": 204.377, "eval_steps_per_second": 6.393, "step": 57000 }, { "epoch": 17.282837391042982, "grad_norm": 1.8246701955795288, "learning_rate": 4.25e-05, "loss": 0.5273, "step": 57500 }, { "epoch": 17.282837391042982, "eval_accuracy": 0.8739937126999742, "eval_loss": 0.6949180960655212, "eval_runtime": 117.0722, "eval_samples_per_second": 203.72, "eval_steps_per_second": 6.372, "step": 57500 }, { "epoch": 17.43312293357379, "grad_norm": 1.7169808149337769, "learning_rate": 4.2e-05, "loss": 0.524, "step": 58000 }, { "epoch": 17.43312293357379, "eval_accuracy": 0.8741740693329195, "eval_loss": 0.694256603717804, "eval_runtime": 107.8207, "eval_samples_per_second": 221.201, "eval_steps_per_second": 6.919, "step": 58000 }, { "epoch": 17.583408476104598, "grad_norm": 1.6059890985488892, "learning_rate": 4.15e-05, "loss": 0.5147, "step": 58500 }, { "epoch": 17.583408476104598, "eval_accuracy": 0.8736431611771553, "eval_loss": 0.6980590224266052, "eval_runtime": 116.8023, "eval_samples_per_second": 204.191, "eval_steps_per_second": 6.387, "step": 58500 }, { "epoch": 17.733694018635408, "grad_norm": 1.7200227975845337, "learning_rate": 4.1e-05, "loss": 0.5232, "step": 59000 }, { "epoch": 17.733694018635408, "eval_accuracy": 0.8748486202324967, "eval_loss": 0.7009324431419373, "eval_runtime": 116.819, "eval_samples_per_second": 204.162, "eval_steps_per_second": 6.386, "step": 59000 }, { "epoch": 17.883979561166214, "grad_norm": 1.703254222869873, "learning_rate": 4.05e-05, "loss": 0.5261, "step": 59500 }, { "epoch": 17.883979561166214, "eval_accuracy": 0.8746740318645458, "eval_loss": 0.69361412525177, "eval_runtime": 107.6767, "eval_samples_per_second": 221.496, "eval_steps_per_second": 6.928, "step": 59500 }, { "epoch": 18.034265103697024, "grad_norm": 1.6703375577926636, "learning_rate": 4e-05, "loss": 0.5145, "step": 60000 }, { "epoch": 18.034265103697024, "eval_accuracy": 0.875889631936868, "eval_loss": 0.6909291744232178, "eval_runtime": 116.8223, "eval_samples_per_second": 204.156, "eval_steps_per_second": 6.386, "step": 60000 }, { "epoch": 18.184550646227834, "grad_norm": 1.8288097381591797, "learning_rate": 3.9500000000000005e-05, "loss": 0.5124, "step": 60500 }, { "epoch": 18.184550646227834, "eval_accuracy": 0.8756345485458439, "eval_loss": 0.69134122133255, "eval_runtime": 116.7517, "eval_samples_per_second": 204.28, "eval_steps_per_second": 6.39, "step": 60500 }, { "epoch": 18.33483618875864, "grad_norm": 1.662782907485962, "learning_rate": 3.9000000000000006e-05, "loss": 0.5164, "step": 61000 }, { "epoch": 18.33483618875864, "eval_accuracy": 0.875405226328803, "eval_loss": 0.6911923289299011, "eval_runtime": 116.9064, "eval_samples_per_second": 204.009, "eval_steps_per_second": 6.381, "step": 61000 }, { "epoch": 18.48512173128945, "grad_norm": 1.9634568691253662, "learning_rate": 3.85e-05, "loss": 0.5149, "step": 61500 }, { "epoch": 18.48512173128945, "eval_accuracy": 0.8751683629469087, "eval_loss": 0.6914454102516174, "eval_runtime": 116.9817, "eval_samples_per_second": 203.878, "eval_steps_per_second": 6.377, "step": 61500 }, { "epoch": 18.63540727382026, "grad_norm": 1.8682461977005005, "learning_rate": 3.8e-05, "loss": 0.513, "step": 62000 }, { "epoch": 18.63540727382026, "eval_accuracy": 0.8760635771091384, "eval_loss": 0.68807452917099, "eval_runtime": 116.6504, "eval_samples_per_second": 204.457, "eval_steps_per_second": 6.395, "step": 62000 }, { "epoch": 18.785692816351066, "grad_norm": 1.5234293937683105, "learning_rate": 3.7500000000000003e-05, "loss": 0.509, "step": 62500 }, { "epoch": 18.785692816351066, "eval_accuracy": 0.8759613115538415, "eval_loss": 0.6957583427429199, "eval_runtime": 116.7007, "eval_samples_per_second": 204.369, "eval_steps_per_second": 6.392, "step": 62500 }, { "epoch": 18.935978358881876, "grad_norm": 1.7746226787567139, "learning_rate": 3.7e-05, "loss": 0.512, "step": 63000 }, { "epoch": 18.935978358881876, "eval_accuracy": 0.876876176581895, "eval_loss": 0.6864527463912964, "eval_runtime": 115.6252, "eval_samples_per_second": 206.27, "eval_steps_per_second": 6.452, "step": 63000 }, { "epoch": 19.086263901412686, "grad_norm": 1.646492600440979, "learning_rate": 3.65e-05, "loss": 0.5046, "step": 63500 }, { "epoch": 19.086263901412686, "eval_accuracy": 0.8768463424161963, "eval_loss": 0.6872249841690063, "eval_runtime": 114.9297, "eval_samples_per_second": 207.518, "eval_steps_per_second": 6.491, "step": 63500 }, { "epoch": 19.236549443943492, "grad_norm": 1.8159064054489136, "learning_rate": 3.6e-05, "loss": 0.5084, "step": 64000 }, { "epoch": 19.236549443943492, "eval_accuracy": 0.8773699378741395, "eval_loss": 0.6870805621147156, "eval_runtime": 116.5459, "eval_samples_per_second": 204.64, "eval_steps_per_second": 6.401, "step": 64000 }, { "epoch": 19.3868349864743, "grad_norm": 2.0112156867980957, "learning_rate": 3.55e-05, "loss": 0.5061, "step": 64500 }, { "epoch": 19.3868349864743, "eval_accuracy": 0.8768580323936667, "eval_loss": 0.6853751540184021, "eval_runtime": 116.8564, "eval_samples_per_second": 204.097, "eval_steps_per_second": 6.384, "step": 64500 }, { "epoch": 19.537120529005108, "grad_norm": 1.6730468273162842, "learning_rate": 3.5e-05, "loss": 0.5046, "step": 65000 }, { "epoch": 19.537120529005108, "eval_accuracy": 0.8771652233862289, "eval_loss": 0.6863858699798584, "eval_runtime": 116.6915, "eval_samples_per_second": 204.385, "eval_steps_per_second": 6.393, "step": 65000 }, { "epoch": 19.687406071535918, "grad_norm": 2.0271100997924805, "learning_rate": 3.45e-05, "loss": 0.5001, "step": 65500 }, { "epoch": 19.687406071535918, "eval_accuracy": 0.8773539501388448, "eval_loss": 0.6785492897033691, "eval_runtime": 116.8739, "eval_samples_per_second": 204.066, "eval_steps_per_second": 6.383, "step": 65500 }, { "epoch": 19.837691614066728, "grad_norm": 1.8729721307754517, "learning_rate": 3.4000000000000007e-05, "loss": 0.505, "step": 66000 }, { "epoch": 19.837691614066728, "eval_accuracy": 0.8780705308130461, "eval_loss": 0.6714682579040527, "eval_runtime": 116.8211, "eval_samples_per_second": 204.158, "eval_steps_per_second": 6.386, "step": 66000 }, { "epoch": 19.987977156597534, "grad_norm": 1.9362084865570068, "learning_rate": 3.35e-05, "loss": 0.5002, "step": 66500 }, { "epoch": 19.987977156597534, "eval_accuracy": 0.8776805950009898, "eval_loss": 0.6802576184272766, "eval_runtime": 116.6312, "eval_samples_per_second": 204.491, "eval_steps_per_second": 6.396, "step": 66500 }, { "epoch": 20.138262699128344, "grad_norm": 1.7120035886764526, "learning_rate": 3.3e-05, "loss": 0.4987, "step": 67000 }, { "epoch": 20.138262699128344, "eval_accuracy": 0.878232828328568, "eval_loss": 0.6773367524147034, "eval_runtime": 116.7533, "eval_samples_per_second": 204.277, "eval_steps_per_second": 6.39, "step": 67000 }, { "epoch": 20.288548241659154, "grad_norm": 1.8688651323318481, "learning_rate": 3.2500000000000004e-05, "loss": 0.4989, "step": 67500 }, { "epoch": 20.288548241659154, "eval_accuracy": 0.8787014026112573, "eval_loss": 0.6670864820480347, "eval_runtime": 108.5708, "eval_samples_per_second": 219.672, "eval_steps_per_second": 6.871, "step": 67500 }, { "epoch": 20.43883378418996, "grad_norm": 1.9278781414031982, "learning_rate": 3.2000000000000005e-05, "loss": 0.4952, "step": 68000 }, { "epoch": 20.43883378418996, "eval_accuracy": 0.878076460127134, "eval_loss": 0.6784164309501648, "eval_runtime": 110.0403, "eval_samples_per_second": 216.739, "eval_steps_per_second": 6.779, "step": 68000 }, { "epoch": 20.58911932672077, "grad_norm": 1.9274306297302246, "learning_rate": 3.15e-05, "loss": 0.4981, "step": 68500 }, { "epoch": 20.58911932672077, "eval_accuracy": 0.8784909065521413, "eval_loss": 0.6816439032554626, "eval_runtime": 116.9304, "eval_samples_per_second": 203.967, "eval_steps_per_second": 6.38, "step": 68500 }, { "epoch": 20.73940486925158, "grad_norm": 3.294536828994751, "learning_rate": 3.1e-05, "loss": 0.498, "step": 69000 }, { "epoch": 20.73940486925158, "eval_accuracy": 0.8785106119785877, "eval_loss": 0.6759437918663025, "eval_runtime": 116.4604, "eval_samples_per_second": 204.791, "eval_steps_per_second": 6.406, "step": 69000 }, { "epoch": 20.889690411782386, "grad_norm": 1.8872230052947998, "learning_rate": 3.05e-05, "loss": 0.4936, "step": 69500 }, { "epoch": 20.889690411782386, "eval_accuracy": 0.8788254121537933, "eval_loss": 0.6709563732147217, "eval_runtime": 116.5595, "eval_samples_per_second": 204.617, "eval_steps_per_second": 6.4, "step": 69500 }, { "epoch": 21.039975954313196, "grad_norm": 1.5999741554260254, "learning_rate": 3e-05, "loss": 0.496, "step": 70000 }, { "epoch": 21.039975954313196, "eval_accuracy": 0.8790234728771115, "eval_loss": 0.6730785965919495, "eval_runtime": 117.0812, "eval_samples_per_second": 203.705, "eval_steps_per_second": 6.372, "step": 70000 }, { "epoch": 21.190261496844002, "grad_norm": 1.8360600471496582, "learning_rate": 2.95e-05, "loss": 0.491, "step": 70500 }, { "epoch": 21.190261496844002, "eval_accuracy": 0.879316152326948, "eval_loss": 0.6717950701713562, "eval_runtime": 116.8552, "eval_samples_per_second": 204.099, "eval_steps_per_second": 6.384, "step": 70500 }, { "epoch": 21.340547039374812, "grad_norm": 1.9219287633895874, "learning_rate": 2.9e-05, "loss": 0.4884, "step": 71000 }, { "epoch": 21.340547039374812, "eval_accuracy": 0.8798596044734723, "eval_loss": 0.6683239340782166, "eval_runtime": 115.4453, "eval_samples_per_second": 206.591, "eval_steps_per_second": 6.462, "step": 71000 }, { "epoch": 21.49083258190562, "grad_norm": 1.7819031476974487, "learning_rate": 2.8499999999999998e-05, "loss": 0.4871, "step": 71500 }, { "epoch": 21.49083258190562, "eval_accuracy": 0.8798842140845434, "eval_loss": 0.6718733310699463, "eval_runtime": 114.7124, "eval_samples_per_second": 207.911, "eval_steps_per_second": 6.503, "step": 71500 }, { "epoch": 21.641118124436428, "grad_norm": 1.7548120021820068, "learning_rate": 2.8000000000000003e-05, "loss": 0.4882, "step": 72000 }, { "epoch": 21.641118124436428, "eval_accuracy": 0.8806871714993016, "eval_loss": 0.6676328182220459, "eval_runtime": 115.22, "eval_samples_per_second": 206.995, "eval_steps_per_second": 6.475, "step": 72000 }, { "epoch": 21.791403666967238, "grad_norm": 1.7839481830596924, "learning_rate": 2.7500000000000004e-05, "loss": 0.4884, "step": 72500 }, { "epoch": 21.791403666967238, "eval_accuracy": 0.8799460097311802, "eval_loss": 0.6682041883468628, "eval_runtime": 115.318, "eval_samples_per_second": 206.819, "eval_steps_per_second": 6.469, "step": 72500 }, { "epoch": 21.941689209498048, "grad_norm": 1.6928880214691162, "learning_rate": 2.7000000000000002e-05, "loss": 0.4866, "step": 73000 }, { "epoch": 21.941689209498048, "eval_accuracy": 0.8800219560265089, "eval_loss": 0.6632567048072815, "eval_runtime": 115.6044, "eval_samples_per_second": 206.307, "eval_steps_per_second": 6.453, "step": 73000 }, { "epoch": 22.091974752028854, "grad_norm": 1.7279243469238281, "learning_rate": 2.6500000000000004e-05, "loss": 0.4834, "step": 73500 }, { "epoch": 22.091974752028854, "eval_accuracy": 0.8804642416122549, "eval_loss": 0.6664032340049744, "eval_runtime": 112.765, "eval_samples_per_second": 211.502, "eval_steps_per_second": 6.616, "step": 73500 }, { "epoch": 22.242260294559664, "grad_norm": 1.9567474126815796, "learning_rate": 2.6000000000000002e-05, "loss": 0.4834, "step": 74000 }, { "epoch": 22.242260294559664, "eval_accuracy": 0.8807840378465421, "eval_loss": 0.6598703861236572, "eval_runtime": 117.0371, "eval_samples_per_second": 203.782, "eval_steps_per_second": 6.374, "step": 74000 }, { "epoch": 22.392545837090474, "grad_norm": 2.2625935077667236, "learning_rate": 2.5500000000000003e-05, "loss": 0.485, "step": 74500 }, { "epoch": 22.392545837090474, "eval_accuracy": 0.8811392723400092, "eval_loss": 0.6655657291412354, "eval_runtime": 117.018, "eval_samples_per_second": 203.815, "eval_steps_per_second": 6.375, "step": 74500 }, { "epoch": 22.54283137962128, "grad_norm": 1.8813621997833252, "learning_rate": 2.5e-05, "loss": 0.4791, "step": 75000 }, { "epoch": 22.54283137962128, "eval_accuracy": 0.8814205475282593, "eval_loss": 0.6609296202659607, "eval_runtime": 115.2619, "eval_samples_per_second": 206.92, "eval_steps_per_second": 6.472, "step": 75000 }, { "epoch": 22.69311692215209, "grad_norm": 1.826451301574707, "learning_rate": 2.45e-05, "loss": 0.481, "step": 75500 }, { "epoch": 22.69311692215209, "eval_accuracy": 0.8811061336424764, "eval_loss": 0.6661359071731567, "eval_runtime": 115.391, "eval_samples_per_second": 206.689, "eval_steps_per_second": 6.465, "step": 75500 }, { "epoch": 22.843402464682896, "grad_norm": 1.6542744636535645, "learning_rate": 2.4e-05, "loss": 0.477, "step": 76000 }, { "epoch": 22.843402464682896, "eval_accuracy": 0.8812821051271282, "eval_loss": 0.6694051623344421, "eval_runtime": 115.3645, "eval_samples_per_second": 206.736, "eval_steps_per_second": 6.466, "step": 76000 }, { "epoch": 22.993688007213706, "grad_norm": 1.682438611984253, "learning_rate": 2.35e-05, "loss": 0.4815, "step": 76500 }, { "epoch": 22.993688007213706, "eval_accuracy": 0.8817553484690157, "eval_loss": 0.6549589037895203, "eval_runtime": 116.9114, "eval_samples_per_second": 204.001, "eval_steps_per_second": 6.381, "step": 76500 }, { "epoch": 23.143973549744516, "grad_norm": 1.7434841394424438, "learning_rate": 2.3000000000000003e-05, "loss": 0.4731, "step": 77000 }, { "epoch": 23.143973549744516, "eval_accuracy": 0.8812738970945014, "eval_loss": 0.6668951511383057, "eval_runtime": 116.8054, "eval_samples_per_second": 204.186, "eval_steps_per_second": 6.387, "step": 77000 }, { "epoch": 23.294259092275322, "grad_norm": 1.9547364711761475, "learning_rate": 2.25e-05, "loss": 0.4771, "step": 77500 }, { "epoch": 23.294259092275322, "eval_accuracy": 0.8824330755550743, "eval_loss": 0.6548491716384888, "eval_runtime": 117.0851, "eval_samples_per_second": 203.698, "eval_steps_per_second": 6.371, "step": 77500 }, { "epoch": 23.44454463480613, "grad_norm": 1.7944780588150024, "learning_rate": 2.2000000000000003e-05, "loss": 0.4755, "step": 78000 }, { "epoch": 23.44454463480613, "eval_accuracy": 0.8817643623427576, "eval_loss": 0.6568784713745117, "eval_runtime": 115.137, "eval_samples_per_second": 207.145, "eval_steps_per_second": 6.479, "step": 78000 }, { "epoch": 23.59483017733694, "grad_norm": 1.7111644744873047, "learning_rate": 2.15e-05, "loss": 0.4719, "step": 78500 }, { "epoch": 23.59483017733694, "eval_accuracy": 0.8824214259033778, "eval_loss": 0.6564775705337524, "eval_runtime": 107.7631, "eval_samples_per_second": 221.319, "eval_steps_per_second": 6.923, "step": 78500 }, { "epoch": 23.745115719867748, "grad_norm": 2.02797532081604, "learning_rate": 2.1e-05, "loss": 0.471, "step": 79000 }, { "epoch": 23.745115719867748, "eval_accuracy": 0.8821614578070651, "eval_loss": 0.6596437096595764, "eval_runtime": 116.8663, "eval_samples_per_second": 204.079, "eval_steps_per_second": 6.383, "step": 79000 }, { "epoch": 23.895401262398558, "grad_norm": 1.6693309545516968, "learning_rate": 2.05e-05, "loss": 0.4766, "step": 79500 }, { "epoch": 23.895401262398558, "eval_accuracy": 0.8834657821482969, "eval_loss": 0.6477739810943604, "eval_runtime": 107.672, "eval_samples_per_second": 221.506, "eval_steps_per_second": 6.928, "step": 79500 }, { "epoch": 24.045686804929367, "grad_norm": 1.656769871711731, "learning_rate": 2e-05, "loss": 0.4712, "step": 80000 }, { "epoch": 24.045686804929367, "eval_accuracy": 0.883265155023816, "eval_loss": 0.6536691188812256, "eval_runtime": 116.9432, "eval_samples_per_second": 203.945, "eval_steps_per_second": 6.379, "step": 80000 }, { "epoch": 24.195972347460174, "grad_norm": 1.7409459352493286, "learning_rate": 1.9500000000000003e-05, "loss": 0.4701, "step": 80500 }, { "epoch": 24.195972347460174, "eval_accuracy": 0.8829075931961291, "eval_loss": 0.6544816493988037, "eval_runtime": 116.833, "eval_samples_per_second": 204.137, "eval_steps_per_second": 6.385, "step": 80500 }, { "epoch": 24.346257889990984, "grad_norm": 2.0059051513671875, "learning_rate": 1.9e-05, "loss": 0.4699, "step": 81000 }, { "epoch": 24.346257889990984, "eval_accuracy": 0.8832343216189223, "eval_loss": 0.6505429148674011, "eval_runtime": 115.2564, "eval_samples_per_second": 206.93, "eval_steps_per_second": 6.473, "step": 81000 }, { "epoch": 24.49654343252179, "grad_norm": 1.7205955982208252, "learning_rate": 1.85e-05, "loss": 0.4687, "step": 81500 }, { "epoch": 24.49654343252179, "eval_accuracy": 0.8837817314915974, "eval_loss": 0.6520202159881592, "eval_runtime": 113.182, "eval_samples_per_second": 210.723, "eval_steps_per_second": 6.591, "step": 81500 }, { "epoch": 24.6468289750526, "grad_norm": 1.6017646789550781, "learning_rate": 1.8e-05, "loss": 0.4696, "step": 82000 }, { "epoch": 24.6468289750526, "eval_accuracy": 0.8834150280615335, "eval_loss": 0.6517773866653442, "eval_runtime": 108.314, "eval_samples_per_second": 220.193, "eval_steps_per_second": 6.887, "step": 82000 }, { "epoch": 24.79711451758341, "grad_norm": 1.9301068782806396, "learning_rate": 1.75e-05, "loss": 0.4628, "step": 82500 }, { "epoch": 24.79711451758341, "eval_accuracy": 0.8838178127874792, "eval_loss": 0.653892457485199, "eval_runtime": 116.9788, "eval_samples_per_second": 203.883, "eval_steps_per_second": 6.377, "step": 82500 }, { "epoch": 24.947400060114216, "grad_norm": 1.6002157926559448, "learning_rate": 1.7000000000000003e-05, "loss": 0.4641, "step": 83000 }, { "epoch": 24.947400060114216, "eval_accuracy": 0.8832907869691026, "eval_loss": 0.6545633673667908, "eval_runtime": 116.6411, "eval_samples_per_second": 204.473, "eval_steps_per_second": 6.396, "step": 83000 }, { "epoch": 25.097685602645026, "grad_norm": 1.8524489402770996, "learning_rate": 1.65e-05, "loss": 0.4627, "step": 83500 }, { "epoch": 25.097685602645026, "eval_accuracy": 0.88376949382746, "eval_loss": 0.6472727656364441, "eval_runtime": 106.8617, "eval_samples_per_second": 223.186, "eval_steps_per_second": 6.981, "step": 83500 }, { "epoch": 25.247971145175836, "grad_norm": 2.1408944129943848, "learning_rate": 1.6000000000000003e-05, "loss": 0.4633, "step": 84000 }, { "epoch": 25.247971145175836, "eval_accuracy": 0.8841161970722239, "eval_loss": 0.6409853100776672, "eval_runtime": 115.3276, "eval_samples_per_second": 206.802, "eval_steps_per_second": 6.469, "step": 84000 }, { "epoch": 25.398256687706642, "grad_norm": 1.9788012504577637, "learning_rate": 1.55e-05, "loss": 0.4649, "step": 84500 }, { "epoch": 25.398256687706642, "eval_accuracy": 0.8843287933961408, "eval_loss": 0.6448610424995422, "eval_runtime": 115.1291, "eval_samples_per_second": 207.159, "eval_steps_per_second": 6.48, "step": 84500 }, { "epoch": 25.54854223023745, "grad_norm": 1.6495180130004883, "learning_rate": 1.5e-05, "loss": 0.4643, "step": 85000 }, { "epoch": 25.54854223023745, "eval_accuracy": 0.8839629183812031, "eval_loss": 0.6554648876190186, "eval_runtime": 116.9775, "eval_samples_per_second": 203.885, "eval_steps_per_second": 6.377, "step": 85000 }, { "epoch": 25.69882777276826, "grad_norm": 1.6574490070343018, "learning_rate": 1.45e-05, "loss": 0.4558, "step": 85500 }, { "epoch": 25.69882777276826, "eval_accuracy": 0.8844802404246292, "eval_loss": 0.6503042578697205, "eval_runtime": 117.1937, "eval_samples_per_second": 203.509, "eval_steps_per_second": 6.366, "step": 85500 }, { "epoch": 25.849113315299068, "grad_norm": 1.5653854608535767, "learning_rate": 1.4000000000000001e-05, "loss": 0.4592, "step": 86000 }, { "epoch": 25.849113315299068, "eval_accuracy": 0.8852508420922508, "eval_loss": 0.6405230760574341, "eval_runtime": 116.7895, "eval_samples_per_second": 204.214, "eval_steps_per_second": 6.388, "step": 86000 }, { "epoch": 25.999398857829878, "grad_norm": 1.6910933256149292, "learning_rate": 1.3500000000000001e-05, "loss": 0.4606, "step": 86500 }, { "epoch": 25.999398857829878, "eval_accuracy": 0.8850751610304982, "eval_loss": 0.6411125659942627, "eval_runtime": 116.7251, "eval_samples_per_second": 204.326, "eval_steps_per_second": 6.391, "step": 86500 }, { "epoch": 26.149684400360684, "grad_norm": 1.7388029098510742, "learning_rate": 1.3000000000000001e-05, "loss": 0.4532, "step": 87000 }, { "epoch": 26.149684400360684, "eval_accuracy": 0.8850505092980453, "eval_loss": 0.6419905424118042, "eval_runtime": 116.6174, "eval_samples_per_second": 204.515, "eval_steps_per_second": 6.397, "step": 87000 }, { "epoch": 26.299969942891494, "grad_norm": 2.105604410171509, "learning_rate": 1.25e-05, "loss": 0.4569, "step": 87500 }, { "epoch": 26.299969942891494, "eval_accuracy": 0.8855862014353842, "eval_loss": 0.6394108533859253, "eval_runtime": 117.1915, "eval_samples_per_second": 203.513, "eval_steps_per_second": 6.366, "step": 87500 }, { "epoch": 26.450255485422304, "grad_norm": 1.6996549367904663, "learning_rate": 1.2e-05, "loss": 0.4548, "step": 88000 }, { "epoch": 26.450255485422304, "eval_accuracy": 0.8852049349201347, "eval_loss": 0.6400984525680542, "eval_runtime": 116.8406, "eval_samples_per_second": 204.124, "eval_steps_per_second": 6.385, "step": 88000 }, { "epoch": 26.60054102795311, "grad_norm": 1.6023077964782715, "learning_rate": 1.1500000000000002e-05, "loss": 0.4553, "step": 88500 }, { "epoch": 26.60054102795311, "eval_accuracy": 0.8851541249486304, "eval_loss": 0.6371135711669922, "eval_runtime": 115.3561, "eval_samples_per_second": 206.751, "eval_steps_per_second": 6.467, "step": 88500 }, { "epoch": 26.75082657048392, "grad_norm": 1.9131189584732056, "learning_rate": 1.1000000000000001e-05, "loss": 0.4577, "step": 89000 }, { "epoch": 26.75082657048392, "eval_accuracy": 0.8858833262505433, "eval_loss": 0.6426717638969421, "eval_runtime": 115.1789, "eval_samples_per_second": 207.069, "eval_steps_per_second": 6.477, "step": 89000 }, { "epoch": 26.90111211301473, "grad_norm": 2.2094035148620605, "learning_rate": 1.05e-05, "loss": 0.4503, "step": 89500 }, { "epoch": 26.90111211301473, "eval_accuracy": 0.8854443570124794, "eval_loss": 0.6391221880912781, "eval_runtime": 115.0687, "eval_samples_per_second": 207.267, "eval_steps_per_second": 6.483, "step": 89500 }, { "epoch": 27.051397655545536, "grad_norm": 1.8893609046936035, "learning_rate": 1e-05, "loss": 0.4553, "step": 90000 }, { "epoch": 27.051397655545536, "eval_accuracy": 0.8858300999459534, "eval_loss": 0.6327862739562988, "eval_runtime": 116.8271, "eval_samples_per_second": 204.148, "eval_steps_per_second": 6.386, "step": 90000 }, { "epoch": 27.201683198076346, "grad_norm": 1.808430552482605, "learning_rate": 9.5e-06, "loss": 0.4509, "step": 90500 }, { "epoch": 27.201683198076346, "eval_accuracy": 0.8861481086840703, "eval_loss": 0.6394175887107849, "eval_runtime": 116.7842, "eval_samples_per_second": 204.223, "eval_steps_per_second": 6.388, "step": 90500 }, { "epoch": 27.351968740607152, "grad_norm": 1.8101272583007812, "learning_rate": 9e-06, "loss": 0.4515, "step": 91000 }, { "epoch": 27.351968740607152, "eval_accuracy": 0.8860981690783971, "eval_loss": 0.6386463046073914, "eval_runtime": 107.7258, "eval_samples_per_second": 221.395, "eval_steps_per_second": 6.925, "step": 91000 }, { "epoch": 27.50225428313796, "grad_norm": 1.9673771858215332, "learning_rate": 8.500000000000002e-06, "loss": 0.4516, "step": 91500 }, { "epoch": 27.50225428313796, "eval_accuracy": 0.8861981048645905, "eval_loss": 0.6380175948143005, "eval_runtime": 117.0597, "eval_samples_per_second": 203.742, "eval_steps_per_second": 6.373, "step": 91500 }, { "epoch": 27.65253982566877, "grad_norm": 1.8094542026519775, "learning_rate": 8.000000000000001e-06, "loss": 0.453, "step": 92000 }, { "epoch": 27.65253982566877, "eval_accuracy": 0.8862071138631781, "eval_loss": 0.6379066109657288, "eval_runtime": 117.0411, "eval_samples_per_second": 203.775, "eval_steps_per_second": 6.374, "step": 92000 }, { "epoch": 27.802825368199578, "grad_norm": 1.9429540634155273, "learning_rate": 7.5e-06, "loss": 0.4511, "step": 92500 }, { "epoch": 27.802825368199578, "eval_accuracy": 0.886301240407379, "eval_loss": 0.6318175196647644, "eval_runtime": 116.8886, "eval_samples_per_second": 204.04, "eval_steps_per_second": 6.382, "step": 92500 }, { "epoch": 27.953110910730388, "grad_norm": 1.730148196220398, "learning_rate": 7.000000000000001e-06, "loss": 0.4495, "step": 93000 }, { "epoch": 27.953110910730388, "eval_accuracy": 0.8861853396429603, "eval_loss": 0.6367453932762146, "eval_runtime": 111.8666, "eval_samples_per_second": 213.2, "eval_steps_per_second": 6.669, "step": 93000 }, { "epoch": 28.103396453261198, "grad_norm": 2.3028879165649414, "learning_rate": 6.5000000000000004e-06, "loss": 0.4465, "step": 93500 }, { "epoch": 28.103396453261198, "eval_accuracy": 0.8867493106047691, "eval_loss": 0.6252174377441406, "eval_runtime": 115.1966, "eval_samples_per_second": 207.037, "eval_steps_per_second": 6.476, "step": 93500 }, { "epoch": 28.253681995792004, "grad_norm": 1.4675511121749878, "learning_rate": 6e-06, "loss": 0.4496, "step": 94000 }, { "epoch": 28.253681995792004, "eval_accuracy": 0.8868322406828231, "eval_loss": 0.6325846910476685, "eval_runtime": 114.9231, "eval_samples_per_second": 207.53, "eval_steps_per_second": 6.491, "step": 94000 }, { "epoch": 28.403967538322814, "grad_norm": 1.7770355939865112, "learning_rate": 5.500000000000001e-06, "loss": 0.4502, "step": 94500 }, { "epoch": 28.403967538322814, "eval_accuracy": 0.8860397801677582, "eval_loss": 0.6367518305778503, "eval_runtime": 114.5513, "eval_samples_per_second": 208.204, "eval_steps_per_second": 6.512, "step": 94500 }, { "epoch": 28.554253080853623, "grad_norm": 1.825067162513733, "learning_rate": 5e-06, "loss": 0.4462, "step": 95000 }, { "epoch": 28.554253080853623, "eval_accuracy": 0.8870841837126774, "eval_loss": 0.6292791366577148, "eval_runtime": 116.782, "eval_samples_per_second": 204.227, "eval_steps_per_second": 6.388, "step": 95000 }, { "epoch": 28.70453862338443, "grad_norm": 1.7869678735733032, "learning_rate": 4.5e-06, "loss": 0.4447, "step": 95500 }, { "epoch": 28.70453862338443, "eval_accuracy": 0.8866019747353604, "eval_loss": 0.6365008354187012, "eval_runtime": 116.7576, "eval_samples_per_second": 204.269, "eval_steps_per_second": 6.389, "step": 95500 }, { "epoch": 28.85482416591524, "grad_norm": 1.7879579067230225, "learning_rate": 4.000000000000001e-06, "loss": 0.4504, "step": 96000 }, { "epoch": 28.85482416591524, "eval_accuracy": 0.8864061843379898, "eval_loss": 0.6379310488700867, "eval_runtime": 108.1048, "eval_samples_per_second": 220.619, "eval_steps_per_second": 6.901, "step": 96000 }, { "epoch": 29.005109708446046, "grad_norm": 1.7637947797775269, "learning_rate": 3.5000000000000004e-06, "loss": 0.4457, "step": 96500 }, { "epoch": 29.005109708446046, "eval_accuracy": 0.8863260944970128, "eval_loss": 0.6372503638267517, "eval_runtime": 113.106, "eval_samples_per_second": 210.864, "eval_steps_per_second": 6.596, "step": 96500 }, { "epoch": 29.155395250976856, "grad_norm": 1.7533965110778809, "learning_rate": 3e-06, "loss": 0.4461, "step": 97000 }, { "epoch": 29.155395250976856, "eval_accuracy": 0.8875985081861223, "eval_loss": 0.6299419403076172, "eval_runtime": 108.2845, "eval_samples_per_second": 220.253, "eval_steps_per_second": 6.889, "step": 97000 }, { "epoch": 29.305680793507666, "grad_norm": 1.9918407201766968, "learning_rate": 2.5e-06, "loss": 0.4466, "step": 97500 }, { "epoch": 29.305680793507666, "eval_accuracy": 0.8872969026134099, "eval_loss": 0.6275209188461304, "eval_runtime": 107.5426, "eval_samples_per_second": 221.773, "eval_steps_per_second": 6.937, "step": 97500 }, { "epoch": 29.455966336038472, "grad_norm": 1.583164095878601, "learning_rate": 2.0000000000000003e-06, "loss": 0.4437, "step": 98000 }, { "epoch": 29.455966336038472, "eval_accuracy": 0.8874573270650793, "eval_loss": 0.6282259821891785, "eval_runtime": 117.0909, "eval_samples_per_second": 203.688, "eval_steps_per_second": 6.371, "step": 98000 }, { "epoch": 29.60625187856928, "grad_norm": 1.759700059890747, "learning_rate": 1.5e-06, "loss": 0.4437, "step": 98500 }, { "epoch": 29.60625187856928, "eval_accuracy": 0.8873068409470514, "eval_loss": 0.632835328578949, "eval_runtime": 116.6535, "eval_samples_per_second": 204.452, "eval_steps_per_second": 6.395, "step": 98500 }, { "epoch": 29.75653742110009, "grad_norm": 1.6183301210403442, "learning_rate": 1.0000000000000002e-06, "loss": 0.4443, "step": 99000 }, { "epoch": 29.75653742110009, "eval_accuracy": 0.8876918497812977, "eval_loss": 0.624975860118866, "eval_runtime": 116.8629, "eval_samples_per_second": 204.085, "eval_steps_per_second": 6.384, "step": 99000 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 31, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.346992290195046e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }