|
{ |
|
"best_metric": 0.8528650403022766, |
|
"best_model_checkpoint": "./model_fine-tune/glot/mbert/dan-Latn/checkpoint-98500", |
|
"epoch": 10.286131996658312, |
|
"eval_steps": 500, |
|
"global_step": 98500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.052213868003341685, |
|
"grad_norm": 4.64046049118042, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.7666, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.052213868003341685, |
|
"eval_accuracy": 0.6961985245587479, |
|
"eval_loss": 1.741060733795166, |
|
"eval_runtime": 361.3155, |
|
"eval_samples_per_second": 175.589, |
|
"eval_steps_per_second": 5.488, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10442773600668337, |
|
"grad_norm": 3.5290920734405518, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.6122, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10442773600668337, |
|
"eval_accuracy": 0.7084748820521898, |
|
"eval_loss": 1.6247525215148926, |
|
"eval_runtime": 360.2132, |
|
"eval_samples_per_second": 176.126, |
|
"eval_steps_per_second": 5.505, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15664160401002505, |
|
"grad_norm": 3.951317071914673, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.5487, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.15664160401002505, |
|
"eval_accuracy": 0.718550082660156, |
|
"eval_loss": 1.553884506225586, |
|
"eval_runtime": 361.2942, |
|
"eval_samples_per_second": 175.599, |
|
"eval_steps_per_second": 5.489, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.20885547201336674, |
|
"grad_norm": 3.799274444580078, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.4946, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20885547201336674, |
|
"eval_accuracy": 0.723134633812131, |
|
"eval_loss": 1.5116215944290161, |
|
"eval_runtime": 360.3813, |
|
"eval_samples_per_second": 176.044, |
|
"eval_steps_per_second": 5.503, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.26106934001670845, |
|
"grad_norm": 3.499929428100586, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.4573, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.26106934001670845, |
|
"eval_accuracy": 0.7287729394802086, |
|
"eval_loss": 1.4748759269714355, |
|
"eval_runtime": 362.14, |
|
"eval_samples_per_second": 175.189, |
|
"eval_steps_per_second": 5.476, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3132832080200501, |
|
"grad_norm": 3.6163783073425293, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.4332, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3132832080200501, |
|
"eval_accuracy": 0.7334675822291515, |
|
"eval_loss": 1.4382221698760986, |
|
"eval_runtime": 362.0967, |
|
"eval_samples_per_second": 175.21, |
|
"eval_steps_per_second": 5.476, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3654970760233918, |
|
"grad_norm": 3.382498264312744, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.3914, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3654970760233918, |
|
"eval_accuracy": 0.7367988898749408, |
|
"eval_loss": 1.4078316688537598, |
|
"eval_runtime": 362.0182, |
|
"eval_samples_per_second": 175.248, |
|
"eval_steps_per_second": 5.478, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4177109440267335, |
|
"grad_norm": 3.2594001293182373, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.3843, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4177109440267335, |
|
"eval_accuracy": 0.7404781561454725, |
|
"eval_loss": 1.37874174118042, |
|
"eval_runtime": 362.2257, |
|
"eval_samples_per_second": 175.148, |
|
"eval_steps_per_second": 5.474, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4699248120300752, |
|
"grad_norm": 3.717684268951416, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.3628, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4699248120300752, |
|
"eval_accuracy": 0.7430362394212798, |
|
"eval_loss": 1.373829960823059, |
|
"eval_runtime": 362.0885, |
|
"eval_samples_per_second": 175.214, |
|
"eval_steps_per_second": 5.477, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5221386800334169, |
|
"grad_norm": 3.67509126663208, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.356, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5221386800334169, |
|
"eval_accuracy": 0.7465132147816155, |
|
"eval_loss": 1.343856930732727, |
|
"eval_runtime": 362.013, |
|
"eval_samples_per_second": 175.251, |
|
"eval_steps_per_second": 5.478, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5743525480367586, |
|
"grad_norm": 2.995349407196045, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.3275, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5743525480367586, |
|
"eval_accuracy": 0.747906382026035, |
|
"eval_loss": 1.3185055255889893, |
|
"eval_runtime": 363.6082, |
|
"eval_samples_per_second": 174.482, |
|
"eval_steps_per_second": 5.454, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6265664160401002, |
|
"grad_norm": 3.0733938217163086, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.3192, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6265664160401002, |
|
"eval_accuracy": 0.7498967127579244, |
|
"eval_loss": 1.3267806768417358, |
|
"eval_runtime": 362.4596, |
|
"eval_samples_per_second": 175.035, |
|
"eval_steps_per_second": 5.471, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6787802840434419, |
|
"grad_norm": 3.0954840183258057, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.3123, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.6787802840434419, |
|
"eval_accuracy": 0.7517548935891929, |
|
"eval_loss": 1.3006523847579956, |
|
"eval_runtime": 363.2093, |
|
"eval_samples_per_second": 174.673, |
|
"eval_steps_per_second": 5.46, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7309941520467836, |
|
"grad_norm": 3.1289381980895996, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 1.2942, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7309941520467836, |
|
"eval_accuracy": 0.7542296938604579, |
|
"eval_loss": 1.2929292917251587, |
|
"eval_runtime": 362.9614, |
|
"eval_samples_per_second": 174.793, |
|
"eval_steps_per_second": 5.463, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7832080200501254, |
|
"grad_norm": 3.440656900405884, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 1.2812, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7832080200501254, |
|
"eval_accuracy": 0.756235706841684, |
|
"eval_loss": 1.2668538093566895, |
|
"eval_runtime": 362.1254, |
|
"eval_samples_per_second": 175.196, |
|
"eval_steps_per_second": 5.476, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.835421888053467, |
|
"grad_norm": 4.0889458656311035, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.2708, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.835421888053467, |
|
"eval_accuracy": 0.7571290622439745, |
|
"eval_loss": 1.283825159072876, |
|
"eval_runtime": 362.2084, |
|
"eval_samples_per_second": 175.156, |
|
"eval_steps_per_second": 5.475, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8876357560568087, |
|
"grad_norm": 2.77811598777771, |
|
"learning_rate": 9.15e-05, |
|
"loss": 1.2759, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.8876357560568087, |
|
"eval_accuracy": 0.7586107082301347, |
|
"eval_loss": 1.2615000009536743, |
|
"eval_runtime": 362.3765, |
|
"eval_samples_per_second": 175.075, |
|
"eval_steps_per_second": 5.472, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9398496240601504, |
|
"grad_norm": 3.350158929824829, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.257, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9398496240601504, |
|
"eval_accuracy": 0.7595859575845869, |
|
"eval_loss": 1.250738501548767, |
|
"eval_runtime": 362.5674, |
|
"eval_samples_per_second": 174.983, |
|
"eval_steps_per_second": 5.469, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9920634920634921, |
|
"grad_norm": 3.2566370964050293, |
|
"learning_rate": 9.05e-05, |
|
"loss": 1.2437, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.9920634920634921, |
|
"eval_accuracy": 0.7619539601093824, |
|
"eval_loss": 1.248534083366394, |
|
"eval_runtime": 362.5746, |
|
"eval_samples_per_second": 174.979, |
|
"eval_steps_per_second": 5.469, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.0442773600668338, |
|
"grad_norm": 3.279320001602173, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2333, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0442773600668338, |
|
"eval_accuracy": 0.762933591347759, |
|
"eval_loss": 1.2348397970199585, |
|
"eval_runtime": 362.8313, |
|
"eval_samples_per_second": 174.855, |
|
"eval_steps_per_second": 5.465, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0964912280701755, |
|
"grad_norm": 3.002204418182373, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 1.2204, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.0964912280701755, |
|
"eval_accuracy": 0.7645752968362766, |
|
"eval_loss": 1.2361088991165161, |
|
"eval_runtime": 362.7427, |
|
"eval_samples_per_second": 174.898, |
|
"eval_steps_per_second": 5.467, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.1487050960735172, |
|
"grad_norm": 4.058037281036377, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.2116, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.1487050960735172, |
|
"eval_accuracy": 0.7654839256801539, |
|
"eval_loss": 1.2149455547332764, |
|
"eval_runtime": 362.0743, |
|
"eval_samples_per_second": 175.221, |
|
"eval_steps_per_second": 5.477, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.2009189640768587, |
|
"grad_norm": 3.364161968231201, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 1.2083, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2009189640768587, |
|
"eval_accuracy": 0.7664035053683952, |
|
"eval_loss": 1.2015998363494873, |
|
"eval_runtime": 362.482, |
|
"eval_samples_per_second": 175.024, |
|
"eval_steps_per_second": 5.471, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2531328320802004, |
|
"grad_norm": 2.995591402053833, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.2056, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.2531328320802004, |
|
"eval_accuracy": 0.76765140727518, |
|
"eval_loss": 1.1961411237716675, |
|
"eval_runtime": 363.557, |
|
"eval_samples_per_second": 174.506, |
|
"eval_steps_per_second": 5.454, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.3053467000835421, |
|
"grad_norm": 3.010730028152466, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.1917, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3053467000835421, |
|
"eval_accuracy": 0.7685342108740758, |
|
"eval_loss": 1.1992570161819458, |
|
"eval_runtime": 362.5156, |
|
"eval_samples_per_second": 175.008, |
|
"eval_steps_per_second": 5.47, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3575605680868839, |
|
"grad_norm": 3.052395820617676, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.1905, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.3575605680868839, |
|
"eval_accuracy": 0.7699597470201401, |
|
"eval_loss": 1.2011651992797852, |
|
"eval_runtime": 361.326, |
|
"eval_samples_per_second": 175.584, |
|
"eval_steps_per_second": 5.488, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.4097744360902256, |
|
"grad_norm": 2.883955717086792, |
|
"learning_rate": 8.65e-05, |
|
"loss": 1.1839, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4097744360902256, |
|
"eval_accuracy": 0.7708384570352297, |
|
"eval_loss": 1.1753507852554321, |
|
"eval_runtime": 362.9321, |
|
"eval_samples_per_second": 174.807, |
|
"eval_steps_per_second": 5.464, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4619883040935673, |
|
"grad_norm": 2.7026546001434326, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.1724, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.4619883040935673, |
|
"eval_accuracy": 0.7709277011489558, |
|
"eval_loss": 1.168317198753357, |
|
"eval_runtime": 362.8518, |
|
"eval_samples_per_second": 174.845, |
|
"eval_steps_per_second": 5.465, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.514202172096909, |
|
"grad_norm": 2.8576297760009766, |
|
"learning_rate": 8.55e-05, |
|
"loss": 1.1758, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.514202172096909, |
|
"eval_accuracy": 0.7720082438542706, |
|
"eval_loss": 1.1711252927780151, |
|
"eval_runtime": 362.6486, |
|
"eval_samples_per_second": 174.943, |
|
"eval_steps_per_second": 5.468, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.5664160401002505, |
|
"grad_norm": 3.314800500869751, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.1656, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.5664160401002505, |
|
"eval_accuracy": 0.7731014057112545, |
|
    "eval_loss": null,
|
"eval_runtime": 362.4489, |
|
"eval_samples_per_second": 175.04, |
|
"eval_steps_per_second": 5.471, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.6186299081035922, |
|
"grad_norm": 2.740029811859131, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 1.1659, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.6186299081035922, |
|
"eval_accuracy": 0.7745773535784676, |
|
"eval_loss": 1.1655621528625488, |
|
"eval_runtime": 362.0759, |
|
"eval_samples_per_second": 175.22, |
|
"eval_steps_per_second": 5.477, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.670843776106934, |
|
"grad_norm": 2.9377198219299316, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.1512, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.670843776106934, |
|
"eval_accuracy": 0.7745256599790964, |
|
"eval_loss": 1.1678109169006348, |
|
"eval_runtime": 362.3938, |
|
"eval_samples_per_second": 175.066, |
|
"eval_steps_per_second": 5.472, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.7230576441102756, |
|
"grad_norm": 3.2804691791534424, |
|
"learning_rate": 8.35e-05, |
|
"loss": 1.1394, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7230576441102756, |
|
"eval_accuracy": 0.7751363378115868, |
|
"eval_loss": 1.1540894508361816, |
|
"eval_runtime": 362.0672, |
|
"eval_samples_per_second": 175.224, |
|
"eval_steps_per_second": 5.477, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7752715121136173, |
|
"grad_norm": 3.486011028289795, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.1382, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.7752715121136173, |
|
"eval_accuracy": 0.7765838022142181, |
|
"eval_loss": 1.1509926319122314, |
|
"eval_runtime": 362.2521, |
|
"eval_samples_per_second": 175.135, |
|
"eval_steps_per_second": 5.474, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.827485380116959, |
|
"grad_norm": 2.56192684173584, |
|
"learning_rate": 8.25e-05, |
|
"loss": 1.1448, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.827485380116959, |
|
"eval_accuracy": 0.7774433499499322, |
|
"eval_loss": 1.150856375694275, |
|
"eval_runtime": 361.437, |
|
"eval_samples_per_second": 175.53, |
|
"eval_steps_per_second": 5.486, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.8796992481203008, |
|
"grad_norm": 2.9113447666168213, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.1376, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.8796992481203008, |
|
"eval_accuracy": 0.7779668095978695, |
|
"eval_loss": 1.1359950304031372, |
|
"eval_runtime": 362.1516, |
|
"eval_samples_per_second": 175.184, |
|
"eval_steps_per_second": 5.476, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.9319131161236425, |
|
"grad_norm": 2.8685195446014404, |
|
"learning_rate": 8.15e-05, |
|
"loss": 1.1306, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9319131161236425, |
|
"eval_accuracy": 0.7786356827454954, |
|
"eval_loss": 1.1356443166732788, |
|
"eval_runtime": 367.0737, |
|
"eval_samples_per_second": 172.835, |
|
"eval_steps_per_second": 5.402, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9841269841269842, |
|
"grad_norm": 3.4618313312530518, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.1379, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.9841269841269842, |
|
"eval_accuracy": 0.7796485696787286, |
|
"eval_loss": 1.1357932090759277, |
|
"eval_runtime": 367.2471, |
|
"eval_samples_per_second": 172.753, |
|
"eval_steps_per_second": 5.4, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.036340852130326, |
|
"grad_norm": 3.1010448932647705, |
|
"learning_rate": 8.05e-05, |
|
"loss": 1.109, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.036340852130326, |
|
"eval_accuracy": 0.7807163576099158, |
|
"eval_loss": 1.1260879039764404, |
|
"eval_runtime": 367.0489, |
|
"eval_samples_per_second": 172.846, |
|
"eval_steps_per_second": 5.403, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0885547201336676, |
|
"grad_norm": 3.1995084285736084, |
|
"learning_rate": 8e-05, |
|
"loss": 1.1047, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.0885547201336676, |
|
"eval_accuracy": 0.7806008208598338, |
|
"eval_loss": 1.1227824687957764, |
|
"eval_runtime": 367.4627, |
|
"eval_samples_per_second": 172.652, |
|
"eval_steps_per_second": 5.396, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.1407685881370093, |
|
"grad_norm": 2.741629123687744, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 1.1042, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.1407685881370093, |
|
"eval_accuracy": 0.7819552122122398, |
|
"eval_loss": 1.1106891632080078, |
|
"eval_runtime": 369.2375, |
|
"eval_samples_per_second": 171.822, |
|
"eval_steps_per_second": 5.371, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.192982456140351, |
|
"grad_norm": 3.0432229042053223, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 1.1071, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.192982456140351, |
|
"eval_accuracy": 0.7825761226202902, |
|
"eval_loss": 1.105894923210144, |
|
"eval_runtime": 368.3863, |
|
"eval_samples_per_second": 172.219, |
|
"eval_steps_per_second": 5.383, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.2451963241436927, |
|
"grad_norm": 2.60355806350708, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 1.1, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2451963241436927, |
|
"eval_accuracy": 0.7824829036453502, |
|
"eval_loss": 1.1067482233047485, |
|
"eval_runtime": 369.3294, |
|
"eval_samples_per_second": 171.779, |
|
"eval_steps_per_second": 5.369, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2974101921470345, |
|
"grad_norm": 3.290499448776245, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.094, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.2974101921470345, |
|
"eval_accuracy": 0.7835772317290801, |
|
"eval_loss": 1.1064658164978027, |
|
"eval_runtime": 371.2345, |
|
"eval_samples_per_second": 170.897, |
|
"eval_steps_per_second": 5.342, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.3496240601503757, |
|
"grad_norm": 2.749668598175049, |
|
"learning_rate": 7.75e-05, |
|
"loss": 1.0868, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.3496240601503757, |
|
"eval_accuracy": 0.7844280402341692, |
|
"eval_loss": 1.1136114597320557, |
|
"eval_runtime": 370.2159, |
|
"eval_samples_per_second": 171.368, |
|
"eval_steps_per_second": 5.356, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.4018379281537174, |
|
"grad_norm": 2.553410768508911, |
|
"learning_rate": 7.7e-05, |
|
"loss": 1.0938, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.4018379281537174, |
|
"eval_accuracy": 0.7854044343476029, |
|
"eval_loss": 1.0965579748153687, |
|
"eval_runtime": 370.6834, |
|
"eval_samples_per_second": 171.151, |
|
"eval_steps_per_second": 5.35, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.454051796157059, |
|
"grad_norm": 2.7348687648773193, |
|
"learning_rate": 7.65e-05, |
|
"loss": 1.0881, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.454051796157059, |
|
"eval_accuracy": 0.7850364727467058, |
|
"eval_loss": 1.1107020378112793, |
|
"eval_runtime": 369.6648, |
|
"eval_samples_per_second": 171.623, |
|
"eval_steps_per_second": 5.364, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.506265664160401, |
|
"grad_norm": 2.639407157897949, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.0885, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.506265664160401, |
|
"eval_accuracy": 0.7867807742632953, |
|
"eval_loss": 1.0890835523605347, |
|
"eval_runtime": 369.0693, |
|
"eval_samples_per_second": 171.9, |
|
"eval_steps_per_second": 5.373, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.5584795321637426, |
|
"grad_norm": 3.1823441982269287, |
|
"learning_rate": 7.55e-05, |
|
"loss": 1.077, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.5584795321637426, |
|
"eval_accuracy": 0.7863467916817448, |
|
"eval_loss": 1.0876108407974243, |
|
"eval_runtime": 368.1359, |
|
"eval_samples_per_second": 172.336, |
|
"eval_steps_per_second": 5.387, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.6106934001670843, |
|
"grad_norm": 2.6731505393981934, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.078, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.6106934001670843, |
|
"eval_accuracy": 0.7872558672911878, |
|
"eval_loss": 1.0856846570968628, |
|
"eval_runtime": 367.5868, |
|
"eval_samples_per_second": 172.593, |
|
"eval_steps_per_second": 5.395, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.662907268170426, |
|
"grad_norm": 2.8799052238464355, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 1.0759, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.662907268170426, |
|
"eval_accuracy": 0.788363306685935, |
|
"eval_loss": 1.080370545387268, |
|
"eval_runtime": 368.0395, |
|
"eval_samples_per_second": 172.381, |
|
"eval_steps_per_second": 5.388, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.7151211361737677, |
|
"grad_norm": 4.7506208419799805, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.072, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.7151211361737677, |
|
"eval_accuracy": 0.7885482778687157, |
|
"eval_loss": 1.08516526222229, |
|
"eval_runtime": 366.0953, |
|
"eval_samples_per_second": 173.296, |
|
"eval_steps_per_second": 5.417, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.7673350041771094, |
|
"grad_norm": 2.411212682723999, |
|
"learning_rate": 7.35e-05, |
|
"loss": 1.0655, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.7673350041771094, |
|
"eval_accuracy": 0.7889261779231521, |
|
"eval_loss": 1.0773206949234009, |
|
"eval_runtime": 366.4448, |
|
"eval_samples_per_second": 173.131, |
|
"eval_steps_per_second": 5.411, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.819548872180451, |
|
"grad_norm": 2.6230037212371826, |
|
"learning_rate": 7.3e-05, |
|
"loss": 1.0747, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.819548872180451, |
|
"eval_accuracy": 0.7891164707308205, |
|
"eval_loss": 1.0783028602600098, |
|
"eval_runtime": 367.8248, |
|
"eval_samples_per_second": 172.482, |
|
"eval_steps_per_second": 5.391, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.871762740183793, |
|
"grad_norm": 2.9123287200927734, |
|
"learning_rate": 7.25e-05, |
|
"loss": 1.0691, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.871762740183793, |
|
"eval_accuracy": 0.7904697432780537, |
|
"eval_loss": 1.0683778524398804, |
|
"eval_runtime": 365.814, |
|
"eval_samples_per_second": 173.43, |
|
"eval_steps_per_second": 5.421, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.9239766081871346, |
|
"grad_norm": 2.716794967651367, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.0581, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.9239766081871346, |
|
"eval_accuracy": 0.7907833141699024, |
|
"eval_loss": 1.0698238611221313, |
|
"eval_runtime": 368.4835, |
|
"eval_samples_per_second": 172.173, |
|
"eval_steps_per_second": 5.382, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 2.900315761566162, |
|
"learning_rate": 7.15e-05, |
|
"loss": 1.0576, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"eval_accuracy": 0.7912189924393805, |
|
"eval_loss": 1.0527312755584717, |
|
"eval_runtime": 367.3342, |
|
"eval_samples_per_second": 172.712, |
|
"eval_steps_per_second": 5.398, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.028404344193818, |
|
"grad_norm": 2.733102321624756, |
|
"learning_rate": 7.1e-05, |
|
"loss": 1.0491, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.028404344193818, |
|
"eval_accuracy": 0.7912027755966868, |
|
"eval_loss": 1.0649120807647705, |
|
"eval_runtime": 367.5821, |
|
"eval_samples_per_second": 172.595, |
|
"eval_steps_per_second": 5.395, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.0806182121971597, |
|
"grad_norm": 2.990572452545166, |
|
"learning_rate": 7.05e-05, |
|
"loss": 1.0429, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.0806182121971597, |
|
"eval_accuracy": 0.791866931892446, |
|
"eval_loss": 1.065590262413025, |
|
"eval_runtime": 367.6583, |
|
"eval_samples_per_second": 172.56, |
|
"eval_steps_per_second": 5.394, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.1328320802005014, |
|
"grad_norm": 2.6242377758026123, |
|
"learning_rate": 7e-05, |
|
"loss": 1.0416, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.1328320802005014, |
|
"eval_accuracy": 0.793420728495659, |
|
"eval_loss": 1.056522250175476, |
|
"eval_runtime": 367.1286, |
|
"eval_samples_per_second": 172.809, |
|
"eval_steps_per_second": 5.401, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.185045948203843, |
|
"grad_norm": 2.6089892387390137, |
|
"learning_rate": 6.95e-05, |
|
"loss": 1.0388, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.185045948203843, |
|
"eval_accuracy": 0.792953820707681, |
|
"eval_loss": 1.0602775812149048, |
|
"eval_runtime": 367.5358, |
|
"eval_samples_per_second": 172.617, |
|
"eval_steps_per_second": 5.395, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.2372598162071844, |
|
"grad_norm": 2.6940767765045166, |
|
"learning_rate": 6.9e-05, |
|
"loss": 1.0351, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.2372598162071844, |
|
"eval_accuracy": 0.7933434972497504, |
|
"eval_loss": 1.0466923713684082, |
|
"eval_runtime": 367.3002, |
|
"eval_samples_per_second": 172.728, |
|
"eval_steps_per_second": 5.399, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"grad_norm": 2.5024611949920654, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 1.0364, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"eval_accuracy": 0.7938457165216277, |
|
"eval_loss": 1.0515408515930176, |
|
"eval_runtime": 367.3309, |
|
"eval_samples_per_second": 172.713, |
|
"eval_steps_per_second": 5.398, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.341687552213868, |
|
"grad_norm": 2.594076156616211, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.0352, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.341687552213868, |
|
"eval_accuracy": 0.7949145678709858, |
|
"eval_loss": 1.0443620681762695, |
|
"eval_runtime": 367.752, |
|
"eval_samples_per_second": 172.516, |
|
"eval_steps_per_second": 5.392, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.3939014202172095, |
|
"grad_norm": 4.887377738952637, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 1.0269, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.3939014202172095, |
|
"eval_accuracy": 0.7945826018068661, |
|
"eval_loss": 1.0488784313201904, |
|
"eval_runtime": 368.3885, |
|
"eval_samples_per_second": 172.218, |
|
"eval_steps_per_second": 5.383, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.4461152882205512, |
|
"grad_norm": 3.1437246799468994, |
|
"learning_rate": 6.7e-05, |
|
"loss": 1.0293, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.4461152882205512, |
|
"eval_accuracy": 0.795481144785727, |
|
"eval_loss": 1.0364991426467896, |
|
"eval_runtime": 366.6755, |
|
"eval_samples_per_second": 173.022, |
|
"eval_steps_per_second": 5.408, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.498329156223893, |
|
"grad_norm": 2.490694761276245, |
|
"learning_rate": 6.65e-05, |
|
"loss": 1.0179, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.498329156223893, |
|
"eval_accuracy": 0.7964686907750312, |
|
"eval_loss": 1.043065071105957, |
|
"eval_runtime": 367.4767, |
|
"eval_samples_per_second": 172.645, |
|
"eval_steps_per_second": 5.396, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.5505430242272347, |
|
"grad_norm": 2.9056098461151123, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.0147, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.5505430242272347, |
|
"eval_accuracy": 0.7962407550483477, |
|
    "eval_loss": null,
|
"eval_runtime": 367.3507, |
|
"eval_samples_per_second": 172.704, |
|
"eval_steps_per_second": 5.398, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.6027568922305764, |
|
"grad_norm": 2.655351161956787, |
|
"learning_rate": 6.55e-05, |
|
"loss": 1.0112, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.6027568922305764, |
|
"eval_accuracy": 0.7970424400295055, |
|
"eval_loss": 1.033272624015808, |
|
"eval_runtime": 367.4604, |
|
"eval_samples_per_second": 172.653, |
|
"eval_steps_per_second": 5.397, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.654970760233918, |
|
"grad_norm": 2.922175168991089, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.0145, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.654970760233918, |
|
"eval_accuracy": 0.7967277004610398, |
|
"eval_loss": 1.0407856702804565, |
|
"eval_runtime": 367.6311, |
|
"eval_samples_per_second": 172.572, |
|
"eval_steps_per_second": 5.394, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.70718462823726, |
|
"grad_norm": 2.7691986560821533, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 1.0133, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.70718462823726, |
|
"eval_accuracy": 0.7978057552609327, |
|
"eval_loss": 1.0315524339675903, |
|
"eval_runtime": 368.5127, |
|
"eval_samples_per_second": 172.16, |
|
"eval_steps_per_second": 5.381, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"grad_norm": 2.718721389770508, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.0123, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"eval_accuracy": 0.7984869599976302, |
|
"eval_loss": 1.0284632444381714, |
|
"eval_runtime": 367.5592, |
|
"eval_samples_per_second": 172.606, |
|
"eval_steps_per_second": 5.395, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.8116123642439432, |
|
"grad_norm": 2.4457788467407227, |
|
"learning_rate": 6.35e-05, |
|
"loss": 1.008, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.8116123642439432, |
|
"eval_accuracy": 0.7985738639575599, |
|
"eval_loss": 1.0290788412094116, |
|
"eval_runtime": 367.5585, |
|
"eval_samples_per_second": 172.607, |
|
"eval_steps_per_second": 5.395, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.863826232247285, |
|
"grad_norm": 3.0719549655914307, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.9995, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.863826232247285, |
|
"eval_accuracy": 0.7990341348986376, |
|
"eval_loss": 1.0225682258605957, |
|
"eval_runtime": 367.5861, |
|
"eval_samples_per_second": 172.594, |
|
"eval_steps_per_second": 5.395, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.9160401002506267, |
|
"grad_norm": 2.502732038497925, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.0059, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.9160401002506267, |
|
"eval_accuracy": 0.7990824767003194, |
|
"eval_loss": 1.0225658416748047, |
|
"eval_runtime": 368.4364, |
|
"eval_samples_per_second": 172.195, |
|
"eval_steps_per_second": 5.382, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"grad_norm": 2.615722894668579, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.9908, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"eval_accuracy": 0.8004074602540703, |
|
    "eval_loss": null,
|
"eval_runtime": 367.3504, |
|
"eval_samples_per_second": 172.704, |
|
"eval_steps_per_second": 5.398, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.02046783625731, |
|
"grad_norm": 2.6223480701446533, |
|
"learning_rate": 6.15e-05, |
|
"loss": 1.0019, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.02046783625731, |
|
"eval_accuracy": 0.8002175287186218, |
|
"eval_loss": 1.0221267938613892, |
|
"eval_runtime": 367.5886, |
|
"eval_samples_per_second": 172.592, |
|
"eval_steps_per_second": 5.395, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.072681704260652, |
|
"grad_norm": 7.730719089508057, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.9859, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.072681704260652, |
|
"eval_accuracy": 0.8004946972174642, |
|
"eval_loss": 1.0225615501403809, |
|
"eval_runtime": 367.5118, |
|
"eval_samples_per_second": 172.628, |
|
"eval_steps_per_second": 5.396, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.124895572263993, |
|
"grad_norm": 2.6854565143585205, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.9899, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.124895572263993, |
|
"eval_accuracy": 0.8006603707418583, |
|
"eval_loss": 1.0182828903198242, |
|
"eval_runtime": 367.2006, |
|
"eval_samples_per_second": 172.775, |
|
"eval_steps_per_second": 5.4, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.177109440267335, |
|
"grad_norm": 2.871232509613037, |
|
"learning_rate": 6e-05, |
|
"loss": 0.9769, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.177109440267335, |
|
"eval_accuracy": 0.8012399052534226, |
|
"eval_loss": 1.007629632949829, |
|
"eval_runtime": 367.698, |
|
"eval_samples_per_second": 172.541, |
|
"eval_steps_per_second": 5.393, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.2293233082706765, |
|
"grad_norm": 2.7055728435516357, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.9862, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.2293233082706765, |
|
"eval_accuracy": 0.8023664324313652, |
|
"eval_loss": 1.0073480606079102, |
|
"eval_runtime": 367.5166, |
|
"eval_samples_per_second": 172.626, |
|
"eval_steps_per_second": 5.396, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.281537176274019, |
|
"grad_norm": 2.628458261489868, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.9784, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.281537176274019, |
|
"eval_accuracy": 0.8022199523230344, |
|
"eval_loss": 1.0004937648773193, |
|
"eval_runtime": 368.0641, |
|
"eval_samples_per_second": 172.369, |
|
"eval_steps_per_second": 5.388, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.33375104427736, |
|
"grad_norm": 2.5863466262817383, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.974, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.33375104427736, |
|
"eval_accuracy": 0.8025649357830319, |
|
"eval_loss": 1.0010262727737427, |
|
"eval_runtime": 367.7203, |
|
"eval_samples_per_second": 172.531, |
|
"eval_steps_per_second": 5.393, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.385964912280702, |
|
"grad_norm": 2.748701333999634, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.9828, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.385964912280702, |
|
"eval_accuracy": 0.8030370401871146, |
|
"eval_loss": 0.998631477355957, |
|
"eval_runtime": 367.8751, |
|
"eval_samples_per_second": 172.458, |
|
"eval_steps_per_second": 5.39, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.438178780284043, |
|
"grad_norm": 2.7004311084747314, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.9863, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.438178780284043, |
|
"eval_accuracy": 0.8032171565331311, |
|
"eval_loss": 1.0026530027389526, |
|
"eval_runtime": 367.6912, |
|
"eval_samples_per_second": 172.544, |
|
"eval_steps_per_second": 5.393, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.4903926482873855, |
|
"grad_norm": 2.4777750968933105, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.9725, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.4903926482873855, |
|
"eval_accuracy": 0.8036803447902678, |
|
"eval_loss": 0.9949015378952026, |
|
"eval_runtime": 367.6755, |
|
"eval_samples_per_second": 172.552, |
|
"eval_steps_per_second": 5.393, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.542606516290727, |
|
"grad_norm": 2.6259942054748535, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.9685, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.542606516290727, |
|
"eval_accuracy": 0.8042554828282609, |
|
"eval_loss": 0.9936387538909912, |
|
"eval_runtime": 368.1053, |
|
"eval_samples_per_second": 172.35, |
|
"eval_steps_per_second": 5.387, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.594820384294069, |
|
"grad_norm": 3.246826648712158, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.9736, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.594820384294069, |
|
"eval_accuracy": 0.8043477668789687, |
|
"eval_loss": 0.9914335608482361, |
|
"eval_runtime": 367.7482, |
|
"eval_samples_per_second": 172.518, |
|
"eval_steps_per_second": 5.392, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.64703425229741, |
|
"grad_norm": 2.689743995666504, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.9695, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.64703425229741, |
|
"eval_accuracy": 0.8044957424284033, |
|
"eval_loss": 1.0012022256851196, |
|
"eval_runtime": 367.8322, |
|
"eval_samples_per_second": 172.478, |
|
"eval_steps_per_second": 5.391, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.6992481203007515, |
|
"grad_norm": 2.6002440452575684, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.9632, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.6992481203007515, |
|
"eval_accuracy": 0.8053408897805091, |
|
"eval_loss": 0.9931978583335876, |
|
"eval_runtime": 367.8417, |
|
"eval_samples_per_second": 172.474, |
|
"eval_steps_per_second": 5.391, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.751461988304094, |
|
"grad_norm": 2.720722198486328, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.9683, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.751461988304094, |
|
"eval_accuracy": 0.8055148927829874, |
|
"eval_loss": 0.9913645386695862, |
|
"eval_runtime": 367.8141, |
|
"eval_samples_per_second": 172.487, |
|
"eval_steps_per_second": 5.391, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.803675856307435, |
|
"grad_norm": 2.6984803676605225, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.9559, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.803675856307435, |
|
"eval_accuracy": 0.8059522006843057, |
|
"eval_loss": 0.9891652464866638, |
|
"eval_runtime": 368.2454, |
|
"eval_samples_per_second": 172.285, |
|
"eval_steps_per_second": 5.385, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.855889724310777, |
|
"grad_norm": 2.4419870376586914, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.9592, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.855889724310777, |
|
"eval_accuracy": 0.805452993603716, |
|
"eval_loss": 0.9957113862037659, |
|
"eval_runtime": 368.2591, |
|
"eval_samples_per_second": 172.278, |
|
"eval_steps_per_second": 5.385, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.908103592314118, |
|
"grad_norm": 2.915653944015503, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.9593, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.908103592314118, |
|
"eval_accuracy": 0.8068829182930286, |
|
"eval_loss": 0.987449049949646, |
|
"eval_runtime": 367.9346, |
|
"eval_samples_per_second": 172.43, |
|
"eval_steps_per_second": 5.39, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.9603174603174605, |
|
"grad_norm": 2.3721494674682617, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.9521, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 4.9603174603174605, |
|
"eval_accuracy": 0.8072047578358524, |
|
"eval_loss": 0.9854400157928467, |
|
"eval_runtime": 368.1441, |
|
"eval_samples_per_second": 172.332, |
|
"eval_steps_per_second": 5.386, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 5.012531328320802, |
|
"grad_norm": 2.7081363201141357, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.9516, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.012531328320802, |
|
"eval_accuracy": 0.8074420340361171, |
|
"eval_loss": 0.9737870097160339, |
|
"eval_runtime": 367.5696, |
|
"eval_samples_per_second": 172.601, |
|
"eval_steps_per_second": 5.395, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.064745196324144, |
|
"grad_norm": 2.834805965423584, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.9361, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 5.064745196324144, |
|
"eval_accuracy": 0.807780889243512, |
|
"eval_loss": null, |
|
"eval_runtime": 366.5337, |
|
"eval_samples_per_second": 173.089, |
|
"eval_steps_per_second": 5.41, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 5.116959064327485, |
|
"grad_norm": 2.5488121509552, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.9472, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.116959064327485, |
|
"eval_accuracy": 0.8083244761680142, |
|
"eval_loss": 0.9714484214782715, |
|
"eval_runtime": 364.0778, |
|
"eval_samples_per_second": 174.257, |
|
"eval_steps_per_second": 5.447, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.169172932330827, |
|
"grad_norm": 2.5738179683685303, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.9387, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.169172932330827, |
|
"eval_accuracy": 0.8083237627586395, |
|
"eval_loss": null, |
|
"eval_runtime": 366.0302, |
|
"eval_samples_per_second": 173.327, |
|
"eval_steps_per_second": 5.418, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.221386800334169, |
|
"grad_norm": 2.578078508377075, |
|
"learning_rate": 5e-05, |
|
"loss": 0.9405, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.221386800334169, |
|
"eval_accuracy": 0.8092523232767513, |
|
"eval_loss": 0.9723265767097473, |
|
"eval_runtime": 365.9623, |
|
"eval_samples_per_second": 173.359, |
|
"eval_steps_per_second": 5.419, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.273600668337511, |
|
"grad_norm": 3.3929808139801025, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.9386, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.273600668337511, |
|
"eval_accuracy": 0.8091146550523858, |
|
"eval_loss": 0.9661393761634827, |
|
"eval_runtime": 365.1784, |
|
"eval_samples_per_second": 173.732, |
|
"eval_steps_per_second": 5.43, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.325814536340852, |
|
"grad_norm": 2.3634896278381348, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.933, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.325814536340852, |
|
"eval_accuracy": 0.808835748295871, |
|
"eval_loss": 0.9813971519470215, |
|
"eval_runtime": 381.116, |
|
"eval_samples_per_second": 166.466, |
|
"eval_steps_per_second": 5.203, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.378028404344194, |
|
"grad_norm": 2.9042844772338867, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.9295, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.378028404344194, |
|
"eval_accuracy": 0.8103665776447174, |
|
"eval_loss": 0.9754624366760254, |
|
"eval_runtime": 396.2254, |
|
"eval_samples_per_second": 160.118, |
|
"eval_steps_per_second": 5.005, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.430242272347535, |
|
"grad_norm": 2.7546634674072266, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.9367, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.430242272347535, |
|
"eval_accuracy": 0.8098531380075188, |
|
"eval_loss": 0.9821533560752869, |
|
"eval_runtime": 400.7427, |
|
"eval_samples_per_second": 158.314, |
|
"eval_steps_per_second": 4.948, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.482456140350878, |
|
"grad_norm": 2.8839492797851562, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.9292, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.482456140350878, |
|
"eval_accuracy": 0.8099045387069566, |
|
"eval_loss": 0.974047064781189, |
|
"eval_runtime": 399.7171, |
|
"eval_samples_per_second": 158.72, |
|
"eval_steps_per_second": 4.961, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.534670008354219, |
|
"grad_norm": 2.3632848262786865, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.9217, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.534670008354219, |
|
"eval_accuracy": 0.8100555138007703, |
|
"eval_loss": 0.9726797938346863, |
|
"eval_runtime": 398.2666, |
|
"eval_samples_per_second": 159.298, |
|
"eval_steps_per_second": 4.979, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.586883876357561, |
|
"grad_norm": 3.127169609069824, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.9294, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.586883876357561, |
|
"eval_accuracy": 0.8108817419721464, |
|
"eval_loss": 0.9585431218147278, |
|
"eval_runtime": 399.6498, |
|
"eval_samples_per_second": 158.746, |
|
"eval_steps_per_second": 4.962, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.639097744360902, |
|
"grad_norm": 2.53694748878479, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.9195, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.639097744360902, |
|
"eval_accuracy": 0.8119484709549883, |
|
"eval_loss": 0.957590639591217, |
|
"eval_runtime": 394.1783, |
|
"eval_samples_per_second": 160.95, |
|
"eval_steps_per_second": 5.031, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.6913116123642435, |
|
"grad_norm": 2.6188199520111084, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.9233, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.6913116123642435, |
|
"eval_accuracy": 0.8120400051070349, |
|
"eval_loss": 0.9545953273773193, |
|
"eval_runtime": 395.9182, |
|
"eval_samples_per_second": 160.243, |
|
"eval_steps_per_second": 5.009, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.743525480367586, |
|
"grad_norm": 2.502518653869629, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.9272, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.743525480367586, |
|
"eval_accuracy": 0.8122659898359591, |
|
"eval_loss": 0.95594322681427, |
|
"eval_runtime": 396.2822, |
|
"eval_samples_per_second": 160.096, |
|
"eval_steps_per_second": 5.004, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.795739348370927, |
|
"grad_norm": 2.527303457260132, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.9163, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.795739348370927, |
|
"eval_accuracy": 0.8124216614631093, |
|
"eval_loss": 0.9586062431335449, |
|
"eval_runtime": 402.0755, |
|
"eval_samples_per_second": 157.789, |
|
"eval_steps_per_second": 4.932, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.847953216374269, |
|
"grad_norm": 2.5056955814361572, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.9249, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.847953216374269, |
|
"eval_accuracy": 0.8123958167743631, |
|
"eval_loss": 0.9501298069953918, |
|
"eval_runtime": 397.3012, |
|
"eval_samples_per_second": 159.685, |
|
"eval_steps_per_second": 4.991, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.90016708437761, |
|
"grad_norm": 2.876765727996826, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.9167, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.90016708437761, |
|
"eval_accuracy": 0.8123634219518265, |
|
"eval_loss": 0.958740770816803, |
|
"eval_runtime": 393.8274, |
|
"eval_samples_per_second": 161.093, |
|
"eval_steps_per_second": 5.035, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.9523809523809526, |
|
"grad_norm": 2.457308769226074, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.9191, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 5.9523809523809526, |
|
"eval_accuracy": 0.8134356064347583, |
|
"eval_loss": 0.9526042938232422, |
|
"eval_runtime": 393.3254, |
|
"eval_samples_per_second": 161.299, |
|
"eval_steps_per_second": 5.042, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 6.004594820384294, |
|
"grad_norm": 2.489978075027466, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.9176, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 6.004594820384294, |
|
"eval_accuracy": 0.8137190312134606, |
|
"eval_loss": 0.9511122703552246, |
|
"eval_runtime": 398.7282, |
|
"eval_samples_per_second": 159.113, |
|
"eval_steps_per_second": 4.973, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 6.056808688387636, |
|
"grad_norm": 2.7550694942474365, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.9034, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.056808688387636, |
|
"eval_accuracy": 0.8137725525224172, |
|
"eval_loss": 0.9519580006599426, |
|
"eval_runtime": 397.3745, |
|
"eval_samples_per_second": 159.655, |
|
"eval_steps_per_second": 4.99, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.109022556390977, |
|
"grad_norm": 2.820486068725586, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.905, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 6.109022556390977, |
|
"eval_accuracy": 0.8139618722177003, |
|
"eval_loss": 0.9513248801231384, |
|
"eval_runtime": 401.2273, |
|
"eval_samples_per_second": 158.122, |
|
"eval_steps_per_second": 4.942, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 6.161236424394319, |
|
"grad_norm": 2.634119749069214, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.902, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.161236424394319, |
|
"eval_accuracy": 0.8135963134645056, |
|
"eval_loss": 0.9465650916099548, |
|
"eval_runtime": 402.0909, |
|
"eval_samples_per_second": 157.783, |
|
"eval_steps_per_second": 4.932, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.213450292397661, |
|
"grad_norm": 2.8152124881744385, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.9016, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.213450292397661, |
|
"eval_accuracy": 0.8144102857826985, |
|
"eval_loss": 0.9431130886077881, |
|
"eval_runtime": 399.633, |
|
"eval_samples_per_second": 158.753, |
|
"eval_steps_per_second": 4.962, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.265664160401003, |
|
"grad_norm": 2.478999137878418, |
|
"learning_rate": 4e-05, |
|
"loss": 0.9018, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.265664160401003, |
|
"eval_accuracy": 0.8148281916671716, |
|
"eval_loss": 0.9465348124504089, |
|
"eval_runtime": 401.047, |
|
"eval_samples_per_second": 158.193, |
|
"eval_steps_per_second": 4.945, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.317878028404344, |
|
"grad_norm": 2.4472334384918213, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.8894, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.317878028404344, |
|
"eval_accuracy": 0.8152884197022962, |
|
"eval_loss": 0.9336789846420288, |
|
"eval_runtime": 399.7265, |
|
"eval_samples_per_second": 158.716, |
|
"eval_steps_per_second": 4.961, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.370091896407686, |
|
"grad_norm": 2.6032795906066895, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.8861, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.370091896407686, |
|
"eval_accuracy": 0.8158878972595904, |
|
"eval_loss": 0.9414376616477966, |
|
"eval_runtime": 403.8156, |
|
"eval_samples_per_second": 157.109, |
|
"eval_steps_per_second": 4.911, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.4223057644110275, |
|
"grad_norm": 2.5782527923583984, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.8929, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.4223057644110275, |
|
"eval_accuracy": 0.8158353407843104, |
|
"eval_loss": 0.9369956254959106, |
|
"eval_runtime": 403.5486, |
|
"eval_samples_per_second": 157.213, |
|
"eval_steps_per_second": 4.914, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.474519632414369, |
|
"grad_norm": 2.3801369667053223, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.8869, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.474519632414369, |
|
"eval_accuracy": 0.8156502533666622, |
|
"eval_loss": 0.9334683418273926, |
|
"eval_runtime": 401.7901, |
|
"eval_samples_per_second": 157.901, |
|
"eval_steps_per_second": 4.935, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.526733500417711, |
|
"grad_norm": 2.44163179397583, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.8933, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.526733500417711, |
|
"eval_accuracy": 0.8168851117616432, |
|
"eval_loss": 0.9306558966636658, |
|
"eval_runtime": 397.9786, |
|
"eval_samples_per_second": 159.413, |
|
"eval_steps_per_second": 4.983, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"grad_norm": 2.2924203872680664, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.8862, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"eval_accuracy": 0.8162968497056878, |
|
"eval_loss": 0.9353703856468201, |
|
"eval_runtime": 394.7488, |
|
"eval_samples_per_second": 160.717, |
|
"eval_steps_per_second": 5.023, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.631161236424394, |
|
"grad_norm": 2.552828550338745, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.8948, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.631161236424394, |
|
"eval_accuracy": 0.8177748484660532, |
|
"eval_loss": 0.9219902157783508, |
|
"eval_runtime": 393.8065, |
|
"eval_samples_per_second": 161.102, |
|
"eval_steps_per_second": 5.035, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.683375104427736, |
|
"grad_norm": 2.423050880432129, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.8889, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.683375104427736, |
|
"eval_accuracy": 0.8173146779439671, |
|
"eval_loss": 0.9342746138572693, |
|
"eval_runtime": 393.0912, |
|
"eval_samples_per_second": 161.395, |
|
"eval_steps_per_second": 5.045, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.735588972431078, |
|
"grad_norm": 2.796268939971924, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.8885, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.735588972431078, |
|
"eval_accuracy": 0.817551689369001, |
|
"eval_loss": 0.9292306303977966, |
|
"eval_runtime": 394.1087, |
|
"eval_samples_per_second": 160.978, |
|
"eval_steps_per_second": 5.032, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.787802840434419, |
|
"grad_norm": 2.8128762245178223, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.8854, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.787802840434419, |
|
"eval_accuracy": 0.8176226440260885, |
|
"eval_loss": 0.9265376925468445, |
|
"eval_runtime": 394.2997, |
|
"eval_samples_per_second": 160.9, |
|
"eval_steps_per_second": 5.029, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.840016708437761, |
|
"grad_norm": 2.8697590827941895, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.88, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.840016708437761, |
|
"eval_accuracy": 0.8185052265994757, |
|
"eval_loss": 0.928463876247406, |
|
"eval_runtime": 394.323, |
|
"eval_samples_per_second": 160.891, |
|
"eval_steps_per_second": 5.029, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.8922305764411025, |
|
"grad_norm": 2.6351752281188965, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.8841, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.8922305764411025, |
|
"eval_accuracy": 0.8183555493481675, |
|
"eval_loss": 0.9194802045822144, |
|
"eval_runtime": 400.5152, |
|
"eval_samples_per_second": 158.403, |
|
"eval_steps_per_second": 4.951, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"grad_norm": 2.2477810382843018, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.8807, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"eval_accuracy": 0.8187747090050445, |
|
"eval_loss": 0.9138378500938416, |
|
"eval_runtime": 394.4731, |
|
"eval_samples_per_second": 160.83, |
|
"eval_steps_per_second": 5.027, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.996658312447786, |
|
"grad_norm": 2.1748201847076416, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.8836, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 6.996658312447786, |
|
"eval_accuracy": 0.8190003355856446, |
|
"eval_loss": 0.9168543815612793, |
|
"eval_runtime": 394.6502, |
|
"eval_samples_per_second": 160.758, |
|
"eval_steps_per_second": 5.025, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 7.048872180451128, |
|
"grad_norm": 2.7435665130615234, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.8707, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 7.048872180451128, |
|
"eval_accuracy": 0.8191917867246192, |
|
"eval_loss": 0.9231117367744446, |
|
"eval_runtime": 392.3745, |
|
"eval_samples_per_second": 161.69, |
|
"eval_steps_per_second": 5.054, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 7.101086048454469, |
|
"grad_norm": 2.569931745529175, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.8701, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.101086048454469, |
|
"eval_accuracy": 0.819969121038317, |
|
"eval_loss": 0.9153112173080444, |
|
"eval_runtime": 393.0875, |
|
"eval_samples_per_second": 161.397, |
|
"eval_steps_per_second": 5.045, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.1532999164578115, |
|
"grad_norm": 2.819972515106201, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.8656, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 7.1532999164578115, |
|
"eval_accuracy": 0.8194268014568229, |
|
"eval_loss": 0.9146177768707275, |
|
"eval_runtime": 394.2167, |
|
"eval_samples_per_second": 160.934, |
|
"eval_steps_per_second": 5.03, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 7.205513784461153, |
|
"grad_norm": 2.415208578109741, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.871, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.205513784461153, |
|
"eval_accuracy": 0.8198532803574815, |
|
"eval_loss": null, |
|
"eval_runtime": 395.7273, |
|
"eval_samples_per_second": 160.32, |
|
"eval_steps_per_second": 5.011, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.257727652464495, |
|
"grad_norm": 2.7900257110595703, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.8685, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.257727652464495, |
|
"eval_accuracy": 0.8207157152692325, |
|
"eval_loss": 0.9127238392829895, |
|
"eval_runtime": 392.7333, |
|
"eval_samples_per_second": 161.542, |
|
"eval_steps_per_second": 5.049, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.309941520467836, |
|
"grad_norm": 2.643833637237549, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8623, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.309941520467836, |
|
"eval_accuracy": 0.8202351749773661, |
|
"eval_loss": 0.9171856045722961, |
|
"eval_runtime": 393.097, |
|
"eval_samples_per_second": 161.393, |
|
"eval_steps_per_second": 5.045, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.362155388471178, |
|
"grad_norm": 2.5037946701049805, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.8601, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.362155388471178, |
|
"eval_accuracy": 0.8211525544791177, |
|
"eval_loss": 0.903904139995575, |
|
"eval_runtime": 394.3466, |
|
"eval_samples_per_second": 160.881, |
|
"eval_steps_per_second": 5.029, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.41436925647452, |
|
"grad_norm": 2.7093377113342285, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.8573, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.41436925647452, |
|
"eval_accuracy": 0.8214304681574243, |
|
"eval_loss": 0.9036524891853333, |
|
"eval_runtime": 393.5778, |
|
"eval_samples_per_second": 161.196, |
|
"eval_steps_per_second": 5.038, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.466583124477861, |
|
"grad_norm": 2.760674476623535, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.8633, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.466583124477861, |
|
"eval_accuracy": 0.8217283854693812, |
|
"eval_loss": 0.9000177979469299, |
|
"eval_runtime": 396.5628, |
|
"eval_samples_per_second": 159.982, |
|
"eval_steps_per_second": 5.0, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.518796992481203, |
|
"grad_norm": 2.831023693084717, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.8576, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.518796992481203, |
|
"eval_accuracy": 0.8211579127032714, |
|
"eval_loss": 0.918953001499176, |
|
"eval_runtime": 400.5863, |
|
"eval_samples_per_second": 158.375, |
|
"eval_steps_per_second": 4.95, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.571010860484545, |
|
"grad_norm": 2.453390598297119, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.854, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.571010860484545, |
|
"eval_accuracy": 0.8220418032437955, |
|
"eval_loss": 0.9041558504104614, |
|
"eval_runtime": 399.249, |
|
"eval_samples_per_second": 158.906, |
|
"eval_steps_per_second": 4.967, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.6232247284878865, |
|
"grad_norm": 2.6029062271118164, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.8615, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.6232247284878865, |
|
"eval_accuracy": 0.8223261586980396, |
|
"eval_loss": 0.8994259834289551, |
|
"eval_runtime": 387.0421, |
|
"eval_samples_per_second": 163.918, |
|
"eval_steps_per_second": 5.123, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.675438596491228, |
|
"grad_norm": 2.3841817378997803, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.8632, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.675438596491228, |
|
"eval_accuracy": 0.8218723618637837, |
|
"eval_loss": 0.9080346822738647, |
|
"eval_runtime": 368.6926, |
|
"eval_samples_per_second": 172.076, |
|
"eval_steps_per_second": 5.378, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.72765246449457, |
|
"grad_norm": 3.8187203407287598, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.8519, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.72765246449457, |
|
"eval_accuracy": 0.8223824804514319, |
|
"eval_loss": 0.9071189761161804, |
|
"eval_runtime": 367.458, |
|
"eval_samples_per_second": 172.654, |
|
"eval_steps_per_second": 5.397, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.779866332497911, |
|
"grad_norm": 2.744349241256714, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.8569, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.779866332497911, |
|
"eval_accuracy": 0.8229439133116854, |
|
"eval_loss": 0.9003260731697083, |
|
"eval_runtime": 369.188, |
|
"eval_samples_per_second": 171.845, |
|
"eval_steps_per_second": 5.371, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.832080200501253, |
|
"grad_norm": 2.3172342777252197, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.8502, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.832080200501253, |
|
"eval_accuracy": 0.8229715307093431, |
|
"eval_loss": 0.9045436382293701, |
|
"eval_runtime": 368.1168, |
|
"eval_samples_per_second": 172.345, |
|
"eval_steps_per_second": 5.387, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.884294068504595, |
|
"grad_norm": 2.6108663082122803, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.8536, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.884294068504595, |
|
"eval_accuracy": 0.8234089591247552, |
|
"eval_loss": 0.9026762247085571, |
|
"eval_runtime": 366.8391, |
|
"eval_samples_per_second": 172.945, |
|
"eval_steps_per_second": 5.406, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.936507936507937, |
|
"grad_norm": 2.315197467803955, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.85, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.936507936507937, |
|
"eval_accuracy": 0.8232359994242249, |
|
"eval_loss": 0.8995440006256104, |
|
"eval_runtime": 367.8582, |
|
"eval_samples_per_second": 172.466, |
|
"eval_steps_per_second": 5.391, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.988721804511278, |
|
"grad_norm": 2.611060380935669, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.8453, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 7.988721804511278, |
|
"eval_accuracy": 0.8234605237364002, |
|
"eval_loss": 0.8912914395332336, |
|
"eval_runtime": 367.6079, |
|
"eval_samples_per_second": 172.583, |
|
"eval_steps_per_second": 5.394, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 8.04093567251462, |
|
"grad_norm": 2.4672865867614746, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.8294, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.04093567251462, |
|
"eval_accuracy": 0.8244379262655704, |
|
"eval_loss": 0.896998405456543, |
|
"eval_runtime": 367.8054, |
|
"eval_samples_per_second": 172.491, |
|
"eval_steps_per_second": 5.391, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.093149540517961, |
|
"grad_norm": 2.645519495010376, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.849, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 8.093149540517961, |
|
"eval_accuracy": 0.8244201499571945, |
|
"eval_loss": 0.8957926034927368, |
|
"eval_runtime": 369.9508, |
|
"eval_samples_per_second": 171.49, |
|
"eval_steps_per_second": 5.36, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 8.145363408521304, |
|
"grad_norm": 2.7964978218078613, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.8363, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.145363408521304, |
|
"eval_accuracy": 0.8244232177017738, |
|
"eval_loss": 0.8884155750274658, |
|
"eval_runtime": 367.358, |
|
"eval_samples_per_second": 172.701, |
|
"eval_steps_per_second": 5.398, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.197577276524646, |
|
"grad_norm": 2.48044490814209, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.8355, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 8.197577276524646, |
|
"eval_accuracy": 0.82532828384256, |
|
"eval_loss": 0.8786827325820923, |
|
"eval_runtime": 366.707, |
|
"eval_samples_per_second": 173.007, |
|
"eval_steps_per_second": 5.408, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 8.249791144527986, |
|
"grad_norm": 2.6931161880493164, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.8439, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.249791144527986, |
|
"eval_accuracy": 0.8246382720646043, |
|
"eval_loss": 0.8848527073860168, |
|
"eval_runtime": 366.9894, |
|
"eval_samples_per_second": 172.874, |
|
"eval_steps_per_second": 5.403, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.302005012531328, |
|
"grad_norm": 2.607593297958374, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.8378, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.302005012531328, |
|
"eval_accuracy": 0.8252490751457086, |
|
"eval_loss": null, |
|
"eval_runtime": 366.5162, |
|
"eval_samples_per_second": 173.097, |
|
"eval_steps_per_second": 5.41, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.35421888053467, |
|
"grad_norm": 2.4364349842071533, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8322, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.35421888053467, |
|
"eval_accuracy": 0.8252812537780041, |
|
"eval_loss": NaN, |
|
"eval_runtime": 367.8653, |
|
"eval_samples_per_second": 172.463, |
|
"eval_steps_per_second": 5.391, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.406432748538013, |
|
"grad_norm": 2.541379928588867, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.8366, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.406432748538013, |
|
"eval_accuracy": 0.8252309925385912, |
|
"eval_loss": 0.9011866450309753, |
|
"eval_runtime": 367.843, |
|
"eval_samples_per_second": 172.473, |
|
"eval_steps_per_second": 5.391, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.458646616541353, |
|
"grad_norm": 2.3451640605926514, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.8313, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.458646616541353, |
|
"eval_accuracy": 0.8259800217048817, |
|
"eval_loss": 0.8837263584136963, |
|
"eval_runtime": 367.6187, |
|
"eval_samples_per_second": 172.578, |
|
"eval_steps_per_second": 5.394, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.510860484544695, |
|
"grad_norm": 2.6211562156677246, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.8289, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.510860484544695, |
|
"eval_accuracy": 0.8257132907861338, |
|
"eval_loss": 0.8861507773399353, |
|
"eval_runtime": 367.3072, |
|
"eval_samples_per_second": 172.725, |
|
"eval_steps_per_second": 5.399, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.563074352548037, |
|
"grad_norm": 2.1193125247955322, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.8337, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.563074352548037, |
|
"eval_accuracy": 0.8263971078151469, |
|
"eval_loss": 0.8711854815483093, |
|
"eval_runtime": 366.8982, |
|
"eval_samples_per_second": 172.917, |
|
"eval_steps_per_second": 5.405, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.615288220551378, |
|
"grad_norm": 2.5682766437530518, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.8207, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.615288220551378, |
|
"eval_accuracy": 0.8263459457288101, |
|
"eval_loss": 0.8814241886138916, |
|
"eval_runtime": 366.9346, |
|
"eval_samples_per_second": 172.9, |
|
"eval_steps_per_second": 5.404, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.66750208855472, |
|
"grad_norm": 2.460284471511841, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.8273, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.66750208855472, |
|
"eval_accuracy": 0.8265486834185438, |
|
"eval_loss": 0.8726556301116943, |
|
"eval_runtime": 367.479, |
|
"eval_samples_per_second": 172.644, |
|
"eval_steps_per_second": 5.396, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.719715956558062, |
|
"grad_norm": 2.519878625869751, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.8212, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.719715956558062, |
|
"eval_accuracy": 0.8265830712467154, |
|
"eval_loss": 0.8803038001060486, |
|
"eval_runtime": 368.0421, |
|
"eval_samples_per_second": 172.38, |
|
"eval_steps_per_second": 5.388, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.771929824561404, |
|
"grad_norm": 2.9779090881347656, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.8267, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.771929824561404, |
|
"eval_accuracy": 0.8266352426703358, |
|
"eval_loss": NaN, |
|
"eval_runtime": 370.1684, |
|
"eval_samples_per_second": 171.39, |
|
"eval_steps_per_second": 5.357, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.824143692564745, |
|
"grad_norm": 2.5584726333618164, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.8236, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.824143692564745, |
|
"eval_accuracy": 0.8272979799699072, |
|
"eval_loss": 0.8709605932235718, |
|
"eval_runtime": 367.2687, |
|
"eval_samples_per_second": 172.743, |
|
"eval_steps_per_second": 5.399, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.876357560568087, |
|
"grad_norm": 2.8752248287200928, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.8284, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.876357560568087, |
|
"eval_accuracy": 0.8270593176674245, |
|
"eval_loss": 0.8703105449676514, |
|
"eval_runtime": 367.121, |
|
"eval_samples_per_second": 172.812, |
|
"eval_steps_per_second": 5.401, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.928571428571429, |
|
"grad_norm": 2.441423177719116, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.8305, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.928571428571429, |
|
"eval_accuracy": 0.8273055974821587, |
|
"eval_loss": 0.8782740831375122, |
|
"eval_runtime": 367.2074, |
|
"eval_samples_per_second": 172.772, |
|
"eval_steps_per_second": 5.4, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.980785296574771, |
|
"grad_norm": 2.5627357959747314, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.824, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 8.980785296574771, |
|
"eval_accuracy": 0.8278787560948533, |
|
"eval_loss": 0.8767423033714294, |
|
"eval_runtime": 367.8162, |
|
"eval_samples_per_second": 172.486, |
|
"eval_steps_per_second": 5.391, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 9.032999164578111, |
|
"grad_norm": 2.9659039974212646, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.8111, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 9.032999164578111, |
|
"eval_accuracy": 0.8281429418603345, |
|
"eval_loss": NaN, |
|
"eval_runtime": 365.8491, |
|
"eval_samples_per_second": 173.413, |
|
"eval_steps_per_second": 5.42, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 9.085213032581454, |
|
"grad_norm": 2.6677966117858887, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.8114, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.085213032581454, |
|
"eval_accuracy": 0.8280896401382127, |
|
"eval_loss": 0.8696718215942383, |
|
"eval_runtime": 368.939, |
|
"eval_samples_per_second": 171.961, |
|
"eval_steps_per_second": 5.375, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.137426900584796, |
|
"grad_norm": 2.8159141540527344, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.8214, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 9.137426900584796, |
|
"eval_accuracy": 0.8287721660611849, |
|
"eval_loss": 0.865568995475769, |
|
"eval_runtime": 367.5282, |
|
"eval_samples_per_second": 172.621, |
|
"eval_steps_per_second": 5.396, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 9.189640768588138, |
|
"grad_norm": 2.44439959526062, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.8117, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.189640768588138, |
|
"eval_accuracy": 0.8286521507089889, |
|
"eval_loss": 0.8658538460731506, |
|
"eval_runtime": 365.9254, |
|
"eval_samples_per_second": 173.377, |
|
"eval_steps_per_second": 5.419, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.241854636591478, |
|
"grad_norm": 2.686903953552246, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.8176, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 9.241854636591478, |
|
"eval_accuracy": 0.8286974413686677, |
|
"eval_loss": 0.8724528551101685, |
|
"eval_runtime": 366.9175, |
|
"eval_samples_per_second": 172.908, |
|
"eval_steps_per_second": 5.404, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 9.29406850459482, |
|
"grad_norm": 2.9780402183532715, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.8098, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.29406850459482, |
|
"eval_accuracy": 0.8288113105821936, |
|
"eval_loss": 0.8693479895591736, |
|
"eval_runtime": 366.5706, |
|
"eval_samples_per_second": 173.072, |
|
"eval_steps_per_second": 5.41, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.346282372598163, |
|
"grad_norm": 2.3616678714752197, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.8101, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.346282372598163, |
|
"eval_accuracy": 0.8288534544077049, |
|
"eval_loss": 0.8676702380180359, |
|
"eval_runtime": 367.6956, |
|
"eval_samples_per_second": 172.542, |
|
"eval_steps_per_second": 5.393, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.398496240601503, |
|
"grad_norm": 2.4607014656066895, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8074, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.398496240601503, |
|
"eval_accuracy": 0.8293368416792034, |
|
"eval_loss": 0.868602454662323, |
|
"eval_runtime": 370.3428, |
|
"eval_samples_per_second": 171.309, |
|
"eval_steps_per_second": 5.354, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.450710108604845, |
|
"grad_norm": 2.303865432739258, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.8075, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.450710108604845, |
|
"eval_accuracy": 0.829557819292624, |
|
"eval_loss": 0.8760720491409302, |
|
"eval_runtime": 366.9515, |
|
"eval_samples_per_second": 172.892, |
|
"eval_steps_per_second": 5.404, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.502923976608187, |
|
"grad_norm": 3.1800107955932617, |
|
"learning_rate": 9e-06, |
|
"loss": 0.813, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.502923976608187, |
|
"eval_accuracy": 0.8293719470428488, |
|
"eval_loss": 0.8725053668022156, |
|
"eval_runtime": 366.7268, |
|
"eval_samples_per_second": 172.998, |
|
"eval_steps_per_second": 5.407, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.55513784461153, |
|
"grad_norm": 2.895458698272705, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.8142, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.55513784461153, |
|
"eval_accuracy": 0.8299493767535929, |
|
"eval_loss": 0.8597660660743713, |
|
"eval_runtime": 366.9591, |
|
"eval_samples_per_second": 172.888, |
|
"eval_steps_per_second": 5.404, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.60735171261487, |
|
"grad_norm": 2.494898796081543, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.8028, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.60735171261487, |
|
"eval_accuracy": 0.8296925250842329, |
|
"eval_loss": 0.8663123250007629, |
|
"eval_runtime": 366.9071, |
|
"eval_samples_per_second": 172.913, |
|
"eval_steps_per_second": 5.405, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.659565580618212, |
|
"grad_norm": 2.6649532318115234, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.8051, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.659565580618212, |
|
"eval_accuracy": 0.8297183234153672, |
|
"eval_loss": 0.8609718084335327, |
|
"eval_runtime": 369.3381, |
|
"eval_samples_per_second": 171.775, |
|
"eval_steps_per_second": 5.369, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.711779448621554, |
|
"grad_norm": 2.708948850631714, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.7982, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.711779448621554, |
|
"eval_accuracy": 0.8295886536989987, |
|
"eval_loss": 0.8697434067726135, |
|
"eval_runtime": 367.9632, |
|
"eval_samples_per_second": 172.417, |
|
"eval_steps_per_second": 5.389, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.763993316624896, |
|
"grad_norm": 2.320215940475464, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.7985, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.763993316624896, |
|
"eval_accuracy": 0.8301079266760504, |
|
"eval_loss": 0.8655940890312195, |
|
"eval_runtime": 366.2895, |
|
"eval_samples_per_second": 173.205, |
|
"eval_steps_per_second": 5.414, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.816207184628237, |
|
"grad_norm": 2.776233673095703, |
|
"learning_rate": 6e-06, |
|
"loss": 0.8016, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.816207184628237, |
|
"eval_accuracy": 0.8302324108540118, |
|
"eval_loss": 0.8625151515007019, |
|
"eval_runtime": 367.8541, |
|
"eval_samples_per_second": 172.468, |
|
"eval_steps_per_second": 5.391, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.868421052631579, |
|
"grad_norm": 2.461749792098999, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.8044, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.868421052631579, |
|
"eval_accuracy": 0.8303814560823992, |
|
"eval_loss": NaN, |
|
"eval_runtime": 366.9598, |
|
"eval_samples_per_second": 172.888, |
|
"eval_steps_per_second": 5.404, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.920634920634921, |
|
"grad_norm": 2.611931085586548, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.920634920634921, |
|
"eval_accuracy": 0.8306366115947627, |
|
"eval_loss": 0.8623033165931702, |
|
"eval_runtime": 368.1962, |
|
"eval_samples_per_second": 172.308, |
|
"eval_steps_per_second": 5.386, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.972848788638263, |
|
"grad_norm": 2.500162363052368, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.7972, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 9.972848788638263, |
|
"eval_accuracy": 0.8306005889087491, |
|
"eval_loss": 0.8600591421127319, |
|
"eval_runtime": 369.0733, |
|
"eval_samples_per_second": 171.898, |
|
"eval_steps_per_second": 5.373, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 10.025062656641603, |
|
"grad_norm": 3.495044231414795, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.7929, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.025062656641603, |
|
"eval_accuracy": 0.8308683319034997, |
|
"eval_loss": 0.8547109961509705, |
|
"eval_runtime": 368.4222, |
|
"eval_samples_per_second": 172.202, |
|
"eval_steps_per_second": 5.382, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.077276524644946, |
|
"grad_norm": 2.7070469856262207, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.7903, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 10.077276524644946, |
|
"eval_accuracy": 0.8307715292279126, |
|
"eval_loss": 0.8600655198097229, |
|
"eval_runtime": 366.9772, |
|
"eval_samples_per_second": 172.88, |
|
"eval_steps_per_second": 5.404, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 10.129490392648288, |
|
"grad_norm": 3.4624171257019043, |
|
"learning_rate": 3e-06, |
|
"loss": 0.7898, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.129490392648288, |
|
"eval_accuracy": 0.831032096060647, |
|
"eval_loss": 0.853934109210968, |
|
"eval_runtime": 368.4043, |
|
"eval_samples_per_second": 172.21, |
|
"eval_steps_per_second": 5.383, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.18170426065163, |
|
"grad_norm": 2.6027026176452637, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.7994, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 10.18170426065163, |
|
"eval_accuracy": 0.8308584971832438, |
|
"eval_loss": 0.8530024886131287, |
|
"eval_runtime": 366.5565, |
|
"eval_samples_per_second": 173.078, |
|
"eval_steps_per_second": 5.41, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 10.23391812865497, |
|
"grad_norm": 2.5377743244171143, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.7926, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.23391812865497, |
|
"eval_accuracy": 0.8310957564926591, |
|
"eval_loss": 0.8588021993637085, |
|
"eval_runtime": 366.0793, |
|
"eval_samples_per_second": 173.304, |
|
"eval_steps_per_second": 5.417, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.286131996658312, |
|
"grad_norm": 2.9179139137268066, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.785, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 10.286131996658312, |
|
"eval_accuracy": 0.8314594660073593, |
|
"eval_loss": 0.8528650403022766, |
|
"eval_runtime": 366.8402, |
|
"eval_samples_per_second": 172.945, |
|
"eval_steps_per_second": 5.406, |
|
"step": 98500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 11, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.304812876669911e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|