mbert_dan-latn / trainer_state.json
DGurgurov's picture
Uploading checkpoint-98500 for mbert - dan-latn
4204957 verified
{
"best_metric": 0.8528650403022766,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/dan-Latn/checkpoint-98500",
"epoch": 10.286131996658312,
"eval_steps": 500,
"global_step": 98500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.052213868003341685,
"grad_norm": 4.64046049118042,
"learning_rate": 9.95e-05,
"loss": 1.7666,
"step": 500
},
{
"epoch": 0.052213868003341685,
"eval_accuracy": 0.6961985245587479,
"eval_loss": 1.741060733795166,
"eval_runtime": 361.3155,
"eval_samples_per_second": 175.589,
"eval_steps_per_second": 5.488,
"step": 500
},
{
"epoch": 0.10442773600668337,
"grad_norm": 3.5290920734405518,
"learning_rate": 9.900000000000001e-05,
"loss": 1.6122,
"step": 1000
},
{
"epoch": 0.10442773600668337,
"eval_accuracy": 0.7084748820521898,
"eval_loss": 1.6247525215148926,
"eval_runtime": 360.2132,
"eval_samples_per_second": 176.126,
"eval_steps_per_second": 5.505,
"step": 1000
},
{
"epoch": 0.15664160401002505,
"grad_norm": 3.951317071914673,
"learning_rate": 9.850000000000001e-05,
"loss": 1.5487,
"step": 1500
},
{
"epoch": 0.15664160401002505,
"eval_accuracy": 0.718550082660156,
"eval_loss": 1.553884506225586,
"eval_runtime": 361.2942,
"eval_samples_per_second": 175.599,
"eval_steps_per_second": 5.489,
"step": 1500
},
{
"epoch": 0.20885547201336674,
"grad_norm": 3.799274444580078,
"learning_rate": 9.8e-05,
"loss": 1.4946,
"step": 2000
},
{
"epoch": 0.20885547201336674,
"eval_accuracy": 0.723134633812131,
"eval_loss": 1.5116215944290161,
"eval_runtime": 360.3813,
"eval_samples_per_second": 176.044,
"eval_steps_per_second": 5.503,
"step": 2000
},
{
"epoch": 0.26106934001670845,
"grad_norm": 3.499929428100586,
"learning_rate": 9.75e-05,
"loss": 1.4573,
"step": 2500
},
{
"epoch": 0.26106934001670845,
"eval_accuracy": 0.7287729394802086,
"eval_loss": 1.4748759269714355,
"eval_runtime": 362.14,
"eval_samples_per_second": 175.189,
"eval_steps_per_second": 5.476,
"step": 2500
},
{
"epoch": 0.3132832080200501,
"grad_norm": 3.6163783073425293,
"learning_rate": 9.7e-05,
"loss": 1.4332,
"step": 3000
},
{
"epoch": 0.3132832080200501,
"eval_accuracy": 0.7334675822291515,
"eval_loss": 1.4382221698760986,
"eval_runtime": 362.0967,
"eval_samples_per_second": 175.21,
"eval_steps_per_second": 5.476,
"step": 3000
},
{
"epoch": 0.3654970760233918,
"grad_norm": 3.382498264312744,
"learning_rate": 9.65e-05,
"loss": 1.3914,
"step": 3500
},
{
"epoch": 0.3654970760233918,
"eval_accuracy": 0.7367988898749408,
"eval_loss": 1.4078316688537598,
"eval_runtime": 362.0182,
"eval_samples_per_second": 175.248,
"eval_steps_per_second": 5.478,
"step": 3500
},
{
"epoch": 0.4177109440267335,
"grad_norm": 3.2594001293182373,
"learning_rate": 9.6e-05,
"loss": 1.3843,
"step": 4000
},
{
"epoch": 0.4177109440267335,
"eval_accuracy": 0.7404781561454725,
"eval_loss": 1.37874174118042,
"eval_runtime": 362.2257,
"eval_samples_per_second": 175.148,
"eval_steps_per_second": 5.474,
"step": 4000
},
{
"epoch": 0.4699248120300752,
"grad_norm": 3.717684268951416,
"learning_rate": 9.55e-05,
"loss": 1.3628,
"step": 4500
},
{
"epoch": 0.4699248120300752,
"eval_accuracy": 0.7430362394212798,
"eval_loss": 1.373829960823059,
"eval_runtime": 362.0885,
"eval_samples_per_second": 175.214,
"eval_steps_per_second": 5.477,
"step": 4500
},
{
"epoch": 0.5221386800334169,
"grad_norm": 3.67509126663208,
"learning_rate": 9.5e-05,
"loss": 1.356,
"step": 5000
},
{
"epoch": 0.5221386800334169,
"eval_accuracy": 0.7465132147816155,
"eval_loss": 1.343856930732727,
"eval_runtime": 362.013,
"eval_samples_per_second": 175.251,
"eval_steps_per_second": 5.478,
"step": 5000
},
{
"epoch": 0.5743525480367586,
"grad_norm": 2.995349407196045,
"learning_rate": 9.449999999999999e-05,
"loss": 1.3275,
"step": 5500
},
{
"epoch": 0.5743525480367586,
"eval_accuracy": 0.747906382026035,
"eval_loss": 1.3185055255889893,
"eval_runtime": 363.6082,
"eval_samples_per_second": 174.482,
"eval_steps_per_second": 5.454,
"step": 5500
},
{
"epoch": 0.6265664160401002,
"grad_norm": 3.0733938217163086,
"learning_rate": 9.4e-05,
"loss": 1.3192,
"step": 6000
},
{
"epoch": 0.6265664160401002,
"eval_accuracy": 0.7498967127579244,
"eval_loss": 1.3267806768417358,
"eval_runtime": 362.4596,
"eval_samples_per_second": 175.035,
"eval_steps_per_second": 5.471,
"step": 6000
},
{
"epoch": 0.6787802840434419,
"grad_norm": 3.0954840183258057,
"learning_rate": 9.350000000000001e-05,
"loss": 1.3123,
"step": 6500
},
{
"epoch": 0.6787802840434419,
"eval_accuracy": 0.7517548935891929,
"eval_loss": 1.3006523847579956,
"eval_runtime": 363.2093,
"eval_samples_per_second": 174.673,
"eval_steps_per_second": 5.46,
"step": 6500
},
{
"epoch": 0.7309941520467836,
"grad_norm": 3.1289381980895996,
"learning_rate": 9.300000000000001e-05,
"loss": 1.2942,
"step": 7000
},
{
"epoch": 0.7309941520467836,
"eval_accuracy": 0.7542296938604579,
"eval_loss": 1.2929292917251587,
"eval_runtime": 362.9614,
"eval_samples_per_second": 174.793,
"eval_steps_per_second": 5.463,
"step": 7000
},
{
"epoch": 0.7832080200501254,
"grad_norm": 3.440656900405884,
"learning_rate": 9.250000000000001e-05,
"loss": 1.2812,
"step": 7500
},
{
"epoch": 0.7832080200501254,
"eval_accuracy": 0.756235706841684,
"eval_loss": 1.2668538093566895,
"eval_runtime": 362.1254,
"eval_samples_per_second": 175.196,
"eval_steps_per_second": 5.476,
"step": 7500
},
{
"epoch": 0.835421888053467,
"grad_norm": 4.0889458656311035,
"learning_rate": 9.200000000000001e-05,
"loss": 1.2708,
"step": 8000
},
{
"epoch": 0.835421888053467,
"eval_accuracy": 0.7571290622439745,
"eval_loss": 1.283825159072876,
"eval_runtime": 362.2084,
"eval_samples_per_second": 175.156,
"eval_steps_per_second": 5.475,
"step": 8000
},
{
"epoch": 0.8876357560568087,
"grad_norm": 2.77811598777771,
"learning_rate": 9.15e-05,
"loss": 1.2759,
"step": 8500
},
{
"epoch": 0.8876357560568087,
"eval_accuracy": 0.7586107082301347,
"eval_loss": 1.2615000009536743,
"eval_runtime": 362.3765,
"eval_samples_per_second": 175.075,
"eval_steps_per_second": 5.472,
"step": 8500
},
{
"epoch": 0.9398496240601504,
"grad_norm": 3.350158929824829,
"learning_rate": 9.1e-05,
"loss": 1.257,
"step": 9000
},
{
"epoch": 0.9398496240601504,
"eval_accuracy": 0.7595859575845869,
"eval_loss": 1.250738501548767,
"eval_runtime": 362.5674,
"eval_samples_per_second": 174.983,
"eval_steps_per_second": 5.469,
"step": 9000
},
{
"epoch": 0.9920634920634921,
"grad_norm": 3.2566370964050293,
"learning_rate": 9.05e-05,
"loss": 1.2437,
"step": 9500
},
{
"epoch": 0.9920634920634921,
"eval_accuracy": 0.7619539601093824,
"eval_loss": 1.248534083366394,
"eval_runtime": 362.5746,
"eval_samples_per_second": 174.979,
"eval_steps_per_second": 5.469,
"step": 9500
},
{
"epoch": 1.0442773600668338,
"grad_norm": 3.279320001602173,
"learning_rate": 9e-05,
"loss": 1.2333,
"step": 10000
},
{
"epoch": 1.0442773600668338,
"eval_accuracy": 0.762933591347759,
"eval_loss": 1.2348397970199585,
"eval_runtime": 362.8313,
"eval_samples_per_second": 174.855,
"eval_steps_per_second": 5.465,
"step": 10000
},
{
"epoch": 1.0964912280701755,
"grad_norm": 3.002204418182373,
"learning_rate": 8.950000000000001e-05,
"loss": 1.2204,
"step": 10500
},
{
"epoch": 1.0964912280701755,
"eval_accuracy": 0.7645752968362766,
"eval_loss": 1.2361088991165161,
"eval_runtime": 362.7427,
"eval_samples_per_second": 174.898,
"eval_steps_per_second": 5.467,
"step": 10500
},
{
"epoch": 1.1487050960735172,
"grad_norm": 4.058037281036377,
"learning_rate": 8.900000000000001e-05,
"loss": 1.2116,
"step": 11000
},
{
"epoch": 1.1487050960735172,
"eval_accuracy": 0.7654839256801539,
"eval_loss": 1.2149455547332764,
"eval_runtime": 362.0743,
"eval_samples_per_second": 175.221,
"eval_steps_per_second": 5.477,
"step": 11000
},
{
"epoch": 1.2009189640768587,
"grad_norm": 3.364161968231201,
"learning_rate": 8.850000000000001e-05,
"loss": 1.2083,
"step": 11500
},
{
"epoch": 1.2009189640768587,
"eval_accuracy": 0.7664035053683952,
"eval_loss": 1.2015998363494873,
"eval_runtime": 362.482,
"eval_samples_per_second": 175.024,
"eval_steps_per_second": 5.471,
"step": 11500
},
{
"epoch": 1.2531328320802004,
"grad_norm": 2.995591402053833,
"learning_rate": 8.800000000000001e-05,
"loss": 1.2056,
"step": 12000
},
{
"epoch": 1.2531328320802004,
"eval_accuracy": 0.76765140727518,
"eval_loss": 1.1961411237716675,
"eval_runtime": 363.557,
"eval_samples_per_second": 174.506,
"eval_steps_per_second": 5.454,
"step": 12000
},
{
"epoch": 1.3053467000835421,
"grad_norm": 3.010730028152466,
"learning_rate": 8.75e-05,
"loss": 1.1917,
"step": 12500
},
{
"epoch": 1.3053467000835421,
"eval_accuracy": 0.7685342108740758,
"eval_loss": 1.1992570161819458,
"eval_runtime": 362.5156,
"eval_samples_per_second": 175.008,
"eval_steps_per_second": 5.47,
"step": 12500
},
{
"epoch": 1.3575605680868839,
"grad_norm": 3.052395820617676,
"learning_rate": 8.7e-05,
"loss": 1.1905,
"step": 13000
},
{
"epoch": 1.3575605680868839,
"eval_accuracy": 0.7699597470201401,
"eval_loss": 1.2011651992797852,
"eval_runtime": 361.326,
"eval_samples_per_second": 175.584,
"eval_steps_per_second": 5.488,
"step": 13000
},
{
"epoch": 1.4097744360902256,
"grad_norm": 2.883955717086792,
"learning_rate": 8.65e-05,
"loss": 1.1839,
"step": 13500
},
{
"epoch": 1.4097744360902256,
"eval_accuracy": 0.7708384570352297,
"eval_loss": 1.1753507852554321,
"eval_runtime": 362.9321,
"eval_samples_per_second": 174.807,
"eval_steps_per_second": 5.464,
"step": 13500
},
{
"epoch": 1.4619883040935673,
"grad_norm": 2.7026546001434326,
"learning_rate": 8.6e-05,
"loss": 1.1724,
"step": 14000
},
{
"epoch": 1.4619883040935673,
"eval_accuracy": 0.7709277011489558,
"eval_loss": 1.168317198753357,
"eval_runtime": 362.8518,
"eval_samples_per_second": 174.845,
"eval_steps_per_second": 5.465,
"step": 14000
},
{
"epoch": 1.514202172096909,
"grad_norm": 2.8576297760009766,
"learning_rate": 8.55e-05,
"loss": 1.1758,
"step": 14500
},
{
"epoch": 1.514202172096909,
"eval_accuracy": 0.7720082438542706,
"eval_loss": 1.1711252927780151,
"eval_runtime": 362.6486,
"eval_samples_per_second": 174.943,
"eval_steps_per_second": 5.468,
"step": 14500
},
{
"epoch": 1.5664160401002505,
"grad_norm": 3.314800500869751,
"learning_rate": 8.5e-05,
"loss": 1.1656,
"step": 15000
},
{
"epoch": 1.5664160401002505,
"eval_accuracy": 0.7731014057112545,
"eval_loss": null,
"eval_runtime": 362.4489,
"eval_samples_per_second": 175.04,
"eval_steps_per_second": 5.471,
"step": 15000
},
{
"epoch": 1.6186299081035922,
"grad_norm": 2.740029811859131,
"learning_rate": 8.450000000000001e-05,
"loss": 1.1659,
"step": 15500
},
{
"epoch": 1.6186299081035922,
"eval_accuracy": 0.7745773535784676,
"eval_loss": 1.1655621528625488,
"eval_runtime": 362.0759,
"eval_samples_per_second": 175.22,
"eval_steps_per_second": 5.477,
"step": 15500
},
{
"epoch": 1.670843776106934,
"grad_norm": 2.9377198219299316,
"learning_rate": 8.4e-05,
"loss": 1.1512,
"step": 16000
},
{
"epoch": 1.670843776106934,
"eval_accuracy": 0.7745256599790964,
"eval_loss": 1.1678109169006348,
"eval_runtime": 362.3938,
"eval_samples_per_second": 175.066,
"eval_steps_per_second": 5.472,
"step": 16000
},
{
"epoch": 1.7230576441102756,
"grad_norm": 3.2804691791534424,
"learning_rate": 8.35e-05,
"loss": 1.1394,
"step": 16500
},
{
"epoch": 1.7230576441102756,
"eval_accuracy": 0.7751363378115868,
"eval_loss": 1.1540894508361816,
"eval_runtime": 362.0672,
"eval_samples_per_second": 175.224,
"eval_steps_per_second": 5.477,
"step": 16500
},
{
"epoch": 1.7752715121136173,
"grad_norm": 3.486011028289795,
"learning_rate": 8.3e-05,
"loss": 1.1382,
"step": 17000
},
{
"epoch": 1.7752715121136173,
"eval_accuracy": 0.7765838022142181,
"eval_loss": 1.1509926319122314,
"eval_runtime": 362.2521,
"eval_samples_per_second": 175.135,
"eval_steps_per_second": 5.474,
"step": 17000
},
{
"epoch": 1.827485380116959,
"grad_norm": 2.56192684173584,
"learning_rate": 8.25e-05,
"loss": 1.1448,
"step": 17500
},
{
"epoch": 1.827485380116959,
"eval_accuracy": 0.7774433499499322,
"eval_loss": 1.150856375694275,
"eval_runtime": 361.437,
"eval_samples_per_second": 175.53,
"eval_steps_per_second": 5.486,
"step": 17500
},
{
"epoch": 1.8796992481203008,
"grad_norm": 2.9113447666168213,
"learning_rate": 8.2e-05,
"loss": 1.1376,
"step": 18000
},
{
"epoch": 1.8796992481203008,
"eval_accuracy": 0.7779668095978695,
"eval_loss": 1.1359950304031372,
"eval_runtime": 362.1516,
"eval_samples_per_second": 175.184,
"eval_steps_per_second": 5.476,
"step": 18000
},
{
"epoch": 1.9319131161236425,
"grad_norm": 2.8685195446014404,
"learning_rate": 8.15e-05,
"loss": 1.1306,
"step": 18500
},
{
"epoch": 1.9319131161236425,
"eval_accuracy": 0.7786356827454954,
"eval_loss": 1.1356443166732788,
"eval_runtime": 367.0737,
"eval_samples_per_second": 172.835,
"eval_steps_per_second": 5.402,
"step": 18500
},
{
"epoch": 1.9841269841269842,
"grad_norm": 3.4618313312530518,
"learning_rate": 8.1e-05,
"loss": 1.1379,
"step": 19000
},
{
"epoch": 1.9841269841269842,
"eval_accuracy": 0.7796485696787286,
"eval_loss": 1.1357932090759277,
"eval_runtime": 367.2471,
"eval_samples_per_second": 172.753,
"eval_steps_per_second": 5.4,
"step": 19000
},
{
"epoch": 2.036340852130326,
"grad_norm": 3.1010448932647705,
"learning_rate": 8.05e-05,
"loss": 1.109,
"step": 19500
},
{
"epoch": 2.036340852130326,
"eval_accuracy": 0.7807163576099158,
"eval_loss": 1.1260879039764404,
"eval_runtime": 367.0489,
"eval_samples_per_second": 172.846,
"eval_steps_per_second": 5.403,
"step": 19500
},
{
"epoch": 2.0885547201336676,
"grad_norm": 3.1995084285736084,
"learning_rate": 8e-05,
"loss": 1.1047,
"step": 20000
},
{
"epoch": 2.0885547201336676,
"eval_accuracy": 0.7806008208598338,
"eval_loss": 1.1227824687957764,
"eval_runtime": 367.4627,
"eval_samples_per_second": 172.652,
"eval_steps_per_second": 5.396,
"step": 20000
},
{
"epoch": 2.1407685881370093,
"grad_norm": 2.741629123687744,
"learning_rate": 7.950000000000001e-05,
"loss": 1.1042,
"step": 20500
},
{
"epoch": 2.1407685881370093,
"eval_accuracy": 0.7819552122122398,
"eval_loss": 1.1106891632080078,
"eval_runtime": 369.2375,
"eval_samples_per_second": 171.822,
"eval_steps_per_second": 5.371,
"step": 20500
},
{
"epoch": 2.192982456140351,
"grad_norm": 3.0432229042053223,
"learning_rate": 7.900000000000001e-05,
"loss": 1.1071,
"step": 21000
},
{
"epoch": 2.192982456140351,
"eval_accuracy": 0.7825761226202902,
"eval_loss": 1.105894923210144,
"eval_runtime": 368.3863,
"eval_samples_per_second": 172.219,
"eval_steps_per_second": 5.383,
"step": 21000
},
{
"epoch": 2.2451963241436927,
"grad_norm": 2.60355806350708,
"learning_rate": 7.850000000000001e-05,
"loss": 1.1,
"step": 21500
},
{
"epoch": 2.2451963241436927,
"eval_accuracy": 0.7824829036453502,
"eval_loss": 1.1067482233047485,
"eval_runtime": 369.3294,
"eval_samples_per_second": 171.779,
"eval_steps_per_second": 5.369,
"step": 21500
},
{
"epoch": 2.2974101921470345,
"grad_norm": 3.290499448776245,
"learning_rate": 7.800000000000001e-05,
"loss": 1.094,
"step": 22000
},
{
"epoch": 2.2974101921470345,
"eval_accuracy": 0.7835772317290801,
"eval_loss": 1.1064658164978027,
"eval_runtime": 371.2345,
"eval_samples_per_second": 170.897,
"eval_steps_per_second": 5.342,
"step": 22000
},
{
"epoch": 2.3496240601503757,
"grad_norm": 2.749668598175049,
"learning_rate": 7.75e-05,
"loss": 1.0868,
"step": 22500
},
{
"epoch": 2.3496240601503757,
"eval_accuracy": 0.7844280402341692,
"eval_loss": 1.1136114597320557,
"eval_runtime": 370.2159,
"eval_samples_per_second": 171.368,
"eval_steps_per_second": 5.356,
"step": 22500
},
{
"epoch": 2.4018379281537174,
"grad_norm": 2.553410768508911,
"learning_rate": 7.7e-05,
"loss": 1.0938,
"step": 23000
},
{
"epoch": 2.4018379281537174,
"eval_accuracy": 0.7854044343476029,
"eval_loss": 1.0965579748153687,
"eval_runtime": 370.6834,
"eval_samples_per_second": 171.151,
"eval_steps_per_second": 5.35,
"step": 23000
},
{
"epoch": 2.454051796157059,
"grad_norm": 2.7348687648773193,
"learning_rate": 7.65e-05,
"loss": 1.0881,
"step": 23500
},
{
"epoch": 2.454051796157059,
"eval_accuracy": 0.7850364727467058,
"eval_loss": 1.1107020378112793,
"eval_runtime": 369.6648,
"eval_samples_per_second": 171.623,
"eval_steps_per_second": 5.364,
"step": 23500
},
{
"epoch": 2.506265664160401,
"grad_norm": 2.639407157897949,
"learning_rate": 7.6e-05,
"loss": 1.0885,
"step": 24000
},
{
"epoch": 2.506265664160401,
"eval_accuracy": 0.7867807742632953,
"eval_loss": 1.0890835523605347,
"eval_runtime": 369.0693,
"eval_samples_per_second": 171.9,
"eval_steps_per_second": 5.373,
"step": 24000
},
{
"epoch": 2.5584795321637426,
"grad_norm": 3.1823441982269287,
"learning_rate": 7.55e-05,
"loss": 1.077,
"step": 24500
},
{
"epoch": 2.5584795321637426,
"eval_accuracy": 0.7863467916817448,
"eval_loss": 1.0876108407974243,
"eval_runtime": 368.1359,
"eval_samples_per_second": 172.336,
"eval_steps_per_second": 5.387,
"step": 24500
},
{
"epoch": 2.6106934001670843,
"grad_norm": 2.6731505393981934,
"learning_rate": 7.500000000000001e-05,
"loss": 1.078,
"step": 25000
},
{
"epoch": 2.6106934001670843,
"eval_accuracy": 0.7872558672911878,
"eval_loss": 1.0856846570968628,
"eval_runtime": 367.5868,
"eval_samples_per_second": 172.593,
"eval_steps_per_second": 5.395,
"step": 25000
},
{
"epoch": 2.662907268170426,
"grad_norm": 2.8799052238464355,
"learning_rate": 7.450000000000001e-05,
"loss": 1.0759,
"step": 25500
},
{
"epoch": 2.662907268170426,
"eval_accuracy": 0.788363306685935,
"eval_loss": 1.080370545387268,
"eval_runtime": 368.0395,
"eval_samples_per_second": 172.381,
"eval_steps_per_second": 5.388,
"step": 25500
},
{
"epoch": 2.7151211361737677,
"grad_norm": 4.7506208419799805,
"learning_rate": 7.4e-05,
"loss": 1.072,
"step": 26000
},
{
"epoch": 2.7151211361737677,
"eval_accuracy": 0.7885482778687157,
"eval_loss": 1.08516526222229,
"eval_runtime": 366.0953,
"eval_samples_per_second": 173.296,
"eval_steps_per_second": 5.417,
"step": 26000
},
{
"epoch": 2.7673350041771094,
"grad_norm": 2.411212682723999,
"learning_rate": 7.35e-05,
"loss": 1.0655,
"step": 26500
},
{
"epoch": 2.7673350041771094,
"eval_accuracy": 0.7889261779231521,
"eval_loss": 1.0773206949234009,
"eval_runtime": 366.4448,
"eval_samples_per_second": 173.131,
"eval_steps_per_second": 5.411,
"step": 26500
},
{
"epoch": 2.819548872180451,
"grad_norm": 2.6230037212371826,
"learning_rate": 7.3e-05,
"loss": 1.0747,
"step": 27000
},
{
"epoch": 2.819548872180451,
"eval_accuracy": 0.7891164707308205,
"eval_loss": 1.0783028602600098,
"eval_runtime": 367.8248,
"eval_samples_per_second": 172.482,
"eval_steps_per_second": 5.391,
"step": 27000
},
{
"epoch": 2.871762740183793,
"grad_norm": 2.9123287200927734,
"learning_rate": 7.25e-05,
"loss": 1.0691,
"step": 27500
},
{
"epoch": 2.871762740183793,
"eval_accuracy": 0.7904697432780537,
"eval_loss": 1.0683778524398804,
"eval_runtime": 365.814,
"eval_samples_per_second": 173.43,
"eval_steps_per_second": 5.421,
"step": 27500
},
{
"epoch": 2.9239766081871346,
"grad_norm": 2.716794967651367,
"learning_rate": 7.2e-05,
"loss": 1.0581,
"step": 28000
},
{
"epoch": 2.9239766081871346,
"eval_accuracy": 0.7907833141699024,
"eval_loss": 1.0698238611221313,
"eval_runtime": 368.4835,
"eval_samples_per_second": 172.173,
"eval_steps_per_second": 5.382,
"step": 28000
},
{
"epoch": 2.9761904761904763,
"grad_norm": 2.900315761566162,
"learning_rate": 7.15e-05,
"loss": 1.0576,
"step": 28500
},
{
"epoch": 2.9761904761904763,
"eval_accuracy": 0.7912189924393805,
"eval_loss": 1.0527312755584717,
"eval_runtime": 367.3342,
"eval_samples_per_second": 172.712,
"eval_steps_per_second": 5.398,
"step": 28500
},
{
"epoch": 3.028404344193818,
"grad_norm": 2.733102321624756,
"learning_rate": 7.1e-05,
"loss": 1.0491,
"step": 29000
},
{
"epoch": 3.028404344193818,
"eval_accuracy": 0.7912027755966868,
"eval_loss": 1.0649120807647705,
"eval_runtime": 367.5821,
"eval_samples_per_second": 172.595,
"eval_steps_per_second": 5.395,
"step": 29000
},
{
"epoch": 3.0806182121971597,
"grad_norm": 2.990572452545166,
"learning_rate": 7.05e-05,
"loss": 1.0429,
"step": 29500
},
{
"epoch": 3.0806182121971597,
"eval_accuracy": 0.791866931892446,
"eval_loss": 1.065590262413025,
"eval_runtime": 367.6583,
"eval_samples_per_second": 172.56,
"eval_steps_per_second": 5.394,
"step": 29500
},
{
"epoch": 3.1328320802005014,
"grad_norm": 2.6242377758026123,
"learning_rate": 7e-05,
"loss": 1.0416,
"step": 30000
},
{
"epoch": 3.1328320802005014,
"eval_accuracy": 0.793420728495659,
"eval_loss": 1.056522250175476,
"eval_runtime": 367.1286,
"eval_samples_per_second": 172.809,
"eval_steps_per_second": 5.401,
"step": 30000
},
{
"epoch": 3.185045948203843,
"grad_norm": 2.6089892387390137,
"learning_rate": 6.95e-05,
"loss": 1.0388,
"step": 30500
},
{
"epoch": 3.185045948203843,
"eval_accuracy": 0.792953820707681,
"eval_loss": 1.0602775812149048,
"eval_runtime": 367.5358,
"eval_samples_per_second": 172.617,
"eval_steps_per_second": 5.395,
"step": 30500
},
{
"epoch": 3.2372598162071844,
"grad_norm": 2.6940767765045166,
"learning_rate": 6.9e-05,
"loss": 1.0351,
"step": 31000
},
{
"epoch": 3.2372598162071844,
"eval_accuracy": 0.7933434972497504,
"eval_loss": 1.0466923713684082,
"eval_runtime": 367.3002,
"eval_samples_per_second": 172.728,
"eval_steps_per_second": 5.399,
"step": 31000
},
{
"epoch": 3.2894736842105265,
"grad_norm": 2.5024611949920654,
"learning_rate": 6.850000000000001e-05,
"loss": 1.0364,
"step": 31500
},
{
"epoch": 3.2894736842105265,
"eval_accuracy": 0.7938457165216277,
"eval_loss": 1.0515408515930176,
"eval_runtime": 367.3309,
"eval_samples_per_second": 172.713,
"eval_steps_per_second": 5.398,
"step": 31500
},
{
"epoch": 3.341687552213868,
"grad_norm": 2.594076156616211,
"learning_rate": 6.800000000000001e-05,
"loss": 1.0352,
"step": 32000
},
{
"epoch": 3.341687552213868,
"eval_accuracy": 0.7949145678709858,
"eval_loss": 1.0443620681762695,
"eval_runtime": 367.752,
"eval_samples_per_second": 172.516,
"eval_steps_per_second": 5.392,
"step": 32000
},
{
"epoch": 3.3939014202172095,
"grad_norm": 4.887377738952637,
"learning_rate": 6.750000000000001e-05,
"loss": 1.0269,
"step": 32500
},
{
"epoch": 3.3939014202172095,
"eval_accuracy": 0.7945826018068661,
"eval_loss": 1.0488784313201904,
"eval_runtime": 368.3885,
"eval_samples_per_second": 172.218,
"eval_steps_per_second": 5.383,
"step": 32500
},
{
"epoch": 3.4461152882205512,
"grad_norm": 3.1437246799468994,
"learning_rate": 6.7e-05,
"loss": 1.0293,
"step": 33000
},
{
"epoch": 3.4461152882205512,
"eval_accuracy": 0.795481144785727,
"eval_loss": 1.0364991426467896,
"eval_runtime": 366.6755,
"eval_samples_per_second": 173.022,
"eval_steps_per_second": 5.408,
"step": 33000
},
{
"epoch": 3.498329156223893,
"grad_norm": 2.490694761276245,
"learning_rate": 6.65e-05,
"loss": 1.0179,
"step": 33500
},
{
"epoch": 3.498329156223893,
"eval_accuracy": 0.7964686907750312,
"eval_loss": 1.043065071105957,
"eval_runtime": 367.4767,
"eval_samples_per_second": 172.645,
"eval_steps_per_second": 5.396,
"step": 33500
},
{
"epoch": 3.5505430242272347,
"grad_norm": 2.9056098461151123,
"learning_rate": 6.6e-05,
"loss": 1.0147,
"step": 34000
},
{
"epoch": 3.5505430242272347,
"eval_accuracy": 0.7962407550483477,
"eval_loss": null,
"eval_runtime": 367.3507,
"eval_samples_per_second": 172.704,
"eval_steps_per_second": 5.398,
"step": 34000
},
{
"epoch": 3.6027568922305764,
"grad_norm": 2.655351161956787,
"learning_rate": 6.55e-05,
"loss": 1.0112,
"step": 34500
},
{
"epoch": 3.6027568922305764,
"eval_accuracy": 0.7970424400295055,
"eval_loss": 1.033272624015808,
"eval_runtime": 367.4604,
"eval_samples_per_second": 172.653,
"eval_steps_per_second": 5.397,
"step": 34500
},
{
"epoch": 3.654970760233918,
"grad_norm": 2.922175168991089,
"learning_rate": 6.500000000000001e-05,
"loss": 1.0145,
"step": 35000
},
{
"epoch": 3.654970760233918,
"eval_accuracy": 0.7967277004610398,
"eval_loss": 1.0407856702804565,
"eval_runtime": 367.6311,
"eval_samples_per_second": 172.572,
"eval_steps_per_second": 5.394,
"step": 35000
},
{
"epoch": 3.70718462823726,
"grad_norm": 2.7691986560821533,
"learning_rate": 6.450000000000001e-05,
"loss": 1.0133,
"step": 35500
},
{
"epoch": 3.70718462823726,
"eval_accuracy": 0.7978057552609327,
"eval_loss": 1.0315524339675903,
"eval_runtime": 368.5127,
"eval_samples_per_second": 172.16,
"eval_steps_per_second": 5.381,
"step": 35500
},
{
"epoch": 3.7593984962406015,
"grad_norm": 2.718721389770508,
"learning_rate": 6.400000000000001e-05,
"loss": 1.0123,
"step": 36000
},
{
"epoch": 3.7593984962406015,
"eval_accuracy": 0.7984869599976302,
"eval_loss": 1.0284632444381714,
"eval_runtime": 367.5592,
"eval_samples_per_second": 172.606,
"eval_steps_per_second": 5.395,
"step": 36000
},
{
"epoch": 3.8116123642439432,
"grad_norm": 2.4457788467407227,
"learning_rate": 6.35e-05,
"loss": 1.008,
"step": 36500
},
{
"epoch": 3.8116123642439432,
"eval_accuracy": 0.7985738639575599,
"eval_loss": 1.0290788412094116,
"eval_runtime": 367.5585,
"eval_samples_per_second": 172.607,
"eval_steps_per_second": 5.395,
"step": 36500
},
{
"epoch": 3.863826232247285,
"grad_norm": 3.0719549655914307,
"learning_rate": 6.3e-05,
"loss": 0.9995,
"step": 37000
},
{
"epoch": 3.863826232247285,
"eval_accuracy": 0.7990341348986376,
"eval_loss": 1.0225682258605957,
"eval_runtime": 367.5861,
"eval_samples_per_second": 172.594,
"eval_steps_per_second": 5.395,
"step": 37000
},
{
"epoch": 3.9160401002506267,
"grad_norm": 2.502732038497925,
"learning_rate": 6.25e-05,
"loss": 1.0059,
"step": 37500
},
{
"epoch": 3.9160401002506267,
"eval_accuracy": 0.7990824767003194,
"eval_loss": 1.0225658416748047,
"eval_runtime": 368.4364,
"eval_samples_per_second": 172.195,
"eval_steps_per_second": 5.382,
"step": 37500
},
{
"epoch": 3.9682539682539684,
"grad_norm": 2.615722894668579,
"learning_rate": 6.2e-05,
"loss": 0.9908,
"step": 38000
},
{
"epoch": 3.9682539682539684,
"eval_accuracy": 0.8004074602540703,
"eval_loss": null,
"eval_runtime": 367.3504,
"eval_samples_per_second": 172.704,
"eval_steps_per_second": 5.398,
"step": 38000
},
{
"epoch": 4.02046783625731,
"grad_norm": 2.6223480701446533,
"learning_rate": 6.15e-05,
"loss": 1.0019,
"step": 38500
},
{
"epoch": 4.02046783625731,
"eval_accuracy": 0.8002175287186218,
"eval_loss": 1.0221267938613892,
"eval_runtime": 367.5886,
"eval_samples_per_second": 172.592,
"eval_steps_per_second": 5.395,
"step": 38500
},
{
"epoch": 4.072681704260652,
"grad_norm": 7.730719089508057,
"learning_rate": 6.1e-05,
"loss": 0.9859,
"step": 39000
},
{
"epoch": 4.072681704260652,
"eval_accuracy": 0.8004946972174642,
"eval_loss": 1.0225615501403809,
"eval_runtime": 367.5118,
"eval_samples_per_second": 172.628,
"eval_steps_per_second": 5.396,
"step": 39000
},
{
"epoch": 4.124895572263993,
"grad_norm": 2.6854565143585205,
"learning_rate": 6.05e-05,
"loss": 0.9899,
"step": 39500
},
{
"epoch": 4.124895572263993,
"eval_accuracy": 0.8006603707418583,
"eval_loss": 1.0182828903198242,
"eval_runtime": 367.2006,
"eval_samples_per_second": 172.775,
"eval_steps_per_second": 5.4,
"step": 39500
},
{
"epoch": 4.177109440267335,
"grad_norm": 2.871232509613037,
"learning_rate": 6e-05,
"loss": 0.9769,
"step": 40000
},
{
"epoch": 4.177109440267335,
"eval_accuracy": 0.8012399052534226,
"eval_loss": 1.007629632949829,
"eval_runtime": 367.698,
"eval_samples_per_second": 172.541,
"eval_steps_per_second": 5.393,
"step": 40000
},
{
"epoch": 4.2293233082706765,
"grad_norm": 2.7055728435516357,
"learning_rate": 5.95e-05,
"loss": 0.9862,
"step": 40500
},
{
"epoch": 4.2293233082706765,
"eval_accuracy": 0.8023664324313652,
"eval_loss": 1.0073480606079102,
"eval_runtime": 367.5166,
"eval_samples_per_second": 172.626,
"eval_steps_per_second": 5.396,
"step": 40500
},
{
"epoch": 4.281537176274019,
"grad_norm": 2.628458261489868,
"learning_rate": 5.9e-05,
"loss": 0.9784,
"step": 41000
},
{
"epoch": 4.281537176274019,
"eval_accuracy": 0.8022199523230344,
"eval_loss": 1.0004937648773193,
"eval_runtime": 368.0641,
"eval_samples_per_second": 172.369,
"eval_steps_per_second": 5.388,
"step": 41000
},
{
"epoch": 4.33375104427736,
"grad_norm": 2.5863466262817383,
"learning_rate": 5.85e-05,
"loss": 0.974,
"step": 41500
},
{
"epoch": 4.33375104427736,
"eval_accuracy": 0.8025649357830319,
"eval_loss": 1.0010262727737427,
"eval_runtime": 367.7203,
"eval_samples_per_second": 172.531,
"eval_steps_per_second": 5.393,
"step": 41500
},
{
"epoch": 4.385964912280702,
"grad_norm": 2.748701333999634,
"learning_rate": 5.8e-05,
"loss": 0.9828,
"step": 42000
},
{
"epoch": 4.385964912280702,
"eval_accuracy": 0.8030370401871146,
"eval_loss": 0.998631477355957,
"eval_runtime": 367.8751,
"eval_samples_per_second": 172.458,
"eval_steps_per_second": 5.39,
"step": 42000
},
{
"epoch": 4.438178780284043,
"grad_norm": 2.7004311084747314,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.9863,
"step": 42500
},
{
"epoch": 4.438178780284043,
"eval_accuracy": 0.8032171565331311,
"eval_loss": 1.0026530027389526,
"eval_runtime": 367.6912,
"eval_samples_per_second": 172.544,
"eval_steps_per_second": 5.393,
"step": 42500
},
{
"epoch": 4.4903926482873855,
"grad_norm": 2.4777750968933105,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.9725,
"step": 43000
},
{
"epoch": 4.4903926482873855,
"eval_accuracy": 0.8036803447902678,
"eval_loss": 0.9949015378952026,
"eval_runtime": 367.6755,
"eval_samples_per_second": 172.552,
"eval_steps_per_second": 5.393,
"step": 43000
},
{
"epoch": 4.542606516290727,
"grad_norm": 2.6259942054748535,
"learning_rate": 5.65e-05,
"loss": 0.9685,
"step": 43500
},
{
"epoch": 4.542606516290727,
"eval_accuracy": 0.8042554828282609,
"eval_loss": 0.9936387538909912,
"eval_runtime": 368.1053,
"eval_samples_per_second": 172.35,
"eval_steps_per_second": 5.387,
"step": 43500
},
{
"epoch": 4.594820384294069,
"grad_norm": 3.246826648712158,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.9736,
"step": 44000
},
{
"epoch": 4.594820384294069,
"eval_accuracy": 0.8043477668789687,
"eval_loss": 0.9914335608482361,
"eval_runtime": 367.7482,
"eval_samples_per_second": 172.518,
"eval_steps_per_second": 5.392,
"step": 44000
},
{
"epoch": 4.64703425229741,
"grad_norm": 2.689743995666504,
"learning_rate": 5.550000000000001e-05,
"loss": 0.9695,
"step": 44500
},
{
"epoch": 4.64703425229741,
"eval_accuracy": 0.8044957424284033,
"eval_loss": 1.0012022256851196,
"eval_runtime": 367.8322,
"eval_samples_per_second": 172.478,
"eval_steps_per_second": 5.391,
"step": 44500
},
{
"epoch": 4.6992481203007515,
"grad_norm": 2.6002440452575684,
"learning_rate": 5.500000000000001e-05,
"loss": 0.9632,
"step": 45000
},
{
"epoch": 4.6992481203007515,
"eval_accuracy": 0.8053408897805091,
"eval_loss": 0.9931978583335876,
"eval_runtime": 367.8417,
"eval_samples_per_second": 172.474,
"eval_steps_per_second": 5.391,
"step": 45000
},
{
"epoch": 4.751461988304094,
"grad_norm": 2.720722198486328,
"learning_rate": 5.45e-05,
"loss": 0.9683,
"step": 45500
},
{
"epoch": 4.751461988304094,
"eval_accuracy": 0.8055148927829874,
"eval_loss": 0.9913645386695862,
"eval_runtime": 367.8141,
"eval_samples_per_second": 172.487,
"eval_steps_per_second": 5.391,
"step": 45500
},
{
"epoch": 4.803675856307435,
"grad_norm": 2.6984803676605225,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.9559,
"step": 46000
},
{
"epoch": 4.803675856307435,
"eval_accuracy": 0.8059522006843057,
"eval_loss": 0.9891652464866638,
"eval_runtime": 368.2454,
"eval_samples_per_second": 172.285,
"eval_steps_per_second": 5.385,
"step": 46000
},
{
"epoch": 4.855889724310777,
"grad_norm": 2.4419870376586914,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.9592,
"step": 46500
},
{
"epoch": 4.855889724310777,
"eval_accuracy": 0.805452993603716,
"eval_loss": 0.9957113862037659,
"eval_runtime": 368.2591,
"eval_samples_per_second": 172.278,
"eval_steps_per_second": 5.385,
"step": 46500
},
{
"epoch": 4.908103592314118,
"grad_norm": 2.915653944015503,
"learning_rate": 5.300000000000001e-05,
"loss": 0.9593,
"step": 47000
},
{
"epoch": 4.908103592314118,
"eval_accuracy": 0.8068829182930286,
"eval_loss": 0.987449049949646,
"eval_runtime": 367.9346,
"eval_samples_per_second": 172.43,
"eval_steps_per_second": 5.39,
"step": 47000
},
{
"epoch": 4.9603174603174605,
"grad_norm": 2.3721494674682617,
"learning_rate": 5.25e-05,
"loss": 0.9521,
"step": 47500
},
{
"epoch": 4.9603174603174605,
"eval_accuracy": 0.8072047578358524,
"eval_loss": 0.9854400157928467,
"eval_runtime": 368.1441,
"eval_samples_per_second": 172.332,
"eval_steps_per_second": 5.386,
"step": 47500
},
{
"epoch": 5.012531328320802,
"grad_norm": 2.7081363201141357,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.9516,
"step": 48000
},
{
"epoch": 5.012531328320802,
"eval_accuracy": 0.8074420340361171,
"eval_loss": 0.9737870097160339,
"eval_runtime": 367.5696,
"eval_samples_per_second": 172.601,
"eval_steps_per_second": 5.395,
"step": 48000
},
{
"epoch": 5.064745196324144,
"grad_norm": 2.834805965423584,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.9361,
"step": 48500
},
{
"epoch": 5.064745196324144,
"eval_accuracy": 0.807780889243512,
"eval_loss": NaN,
"eval_runtime": 366.5337,
"eval_samples_per_second": 173.089,
"eval_steps_per_second": 5.41,
"step": 48500
},
{
"epoch": 5.116959064327485,
"grad_norm": 2.5488121509552,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.9472,
"step": 49000
},
{
"epoch": 5.116959064327485,
"eval_accuracy": 0.8083244761680142,
"eval_loss": 0.9714484214782715,
"eval_runtime": 364.0778,
"eval_samples_per_second": 174.257,
"eval_steps_per_second": 5.447,
"step": 49000
},
{
"epoch": 5.169172932330827,
"grad_norm": 2.5738179683685303,
"learning_rate": 5.05e-05,
"loss": 0.9387,
"step": 49500
},
{
"epoch": 5.169172932330827,
"eval_accuracy": 0.8083237627586395,
"eval_loss": NaN,
"eval_runtime": 366.0302,
"eval_samples_per_second": 173.327,
"eval_steps_per_second": 5.418,
"step": 49500
},
{
"epoch": 5.221386800334169,
"grad_norm": 2.578078508377075,
"learning_rate": 5e-05,
"loss": 0.9405,
"step": 50000
},
{
"epoch": 5.221386800334169,
"eval_accuracy": 0.8092523232767513,
"eval_loss": 0.9723265767097473,
"eval_runtime": 365.9623,
"eval_samples_per_second": 173.359,
"eval_steps_per_second": 5.419,
"step": 50000
},
{
"epoch": 5.273600668337511,
"grad_norm": 3.3929808139801025,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.9386,
"step": 50500
},
{
"epoch": 5.273600668337511,
"eval_accuracy": 0.8091146550523858,
"eval_loss": 0.9661393761634827,
"eval_runtime": 365.1784,
"eval_samples_per_second": 173.732,
"eval_steps_per_second": 5.43,
"step": 50500
},
{
"epoch": 5.325814536340852,
"grad_norm": 2.3634896278381348,
"learning_rate": 4.9e-05,
"loss": 0.933,
"step": 51000
},
{
"epoch": 5.325814536340852,
"eval_accuracy": 0.808835748295871,
"eval_loss": 0.9813971519470215,
"eval_runtime": 381.116,
"eval_samples_per_second": 166.466,
"eval_steps_per_second": 5.203,
"step": 51000
},
{
"epoch": 5.378028404344194,
"grad_norm": 2.9042844772338867,
"learning_rate": 4.85e-05,
"loss": 0.9295,
"step": 51500
},
{
"epoch": 5.378028404344194,
"eval_accuracy": 0.8103665776447174,
"eval_loss": 0.9754624366760254,
"eval_runtime": 396.2254,
"eval_samples_per_second": 160.118,
"eval_steps_per_second": 5.005,
"step": 51500
},
{
"epoch": 5.430242272347535,
"grad_norm": 2.7546634674072266,
"learning_rate": 4.8e-05,
"loss": 0.9367,
"step": 52000
},
{
"epoch": 5.430242272347535,
"eval_accuracy": 0.8098531380075188,
"eval_loss": 0.9821533560752869,
"eval_runtime": 400.7427,
"eval_samples_per_second": 158.314,
"eval_steps_per_second": 4.948,
"step": 52000
},
{
"epoch": 5.482456140350878,
"grad_norm": 2.8839492797851562,
"learning_rate": 4.75e-05,
"loss": 0.9292,
"step": 52500
},
{
"epoch": 5.482456140350878,
"eval_accuracy": 0.8099045387069566,
"eval_loss": 0.974047064781189,
"eval_runtime": 399.7171,
"eval_samples_per_second": 158.72,
"eval_steps_per_second": 4.961,
"step": 52500
},
{
"epoch": 5.534670008354219,
"grad_norm": 2.3632848262786865,
"learning_rate": 4.7e-05,
"loss": 0.9217,
"step": 53000
},
{
"epoch": 5.534670008354219,
"eval_accuracy": 0.8100555138007703,
"eval_loss": 0.9726797938346863,
"eval_runtime": 398.2666,
"eval_samples_per_second": 159.298,
"eval_steps_per_second": 4.979,
"step": 53000
},
{
"epoch": 5.586883876357561,
"grad_norm": 3.127169609069824,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.9294,
"step": 53500
},
{
"epoch": 5.586883876357561,
"eval_accuracy": 0.8108817419721464,
"eval_loss": 0.9585431218147278,
"eval_runtime": 399.6498,
"eval_samples_per_second": 158.746,
"eval_steps_per_second": 4.962,
"step": 53500
},
{
"epoch": 5.639097744360902,
"grad_norm": 2.53694748878479,
"learning_rate": 4.600000000000001e-05,
"loss": 0.9195,
"step": 54000
},
{
"epoch": 5.639097744360902,
"eval_accuracy": 0.8119484709549883,
"eval_loss": 0.957590639591217,
"eval_runtime": 394.1783,
"eval_samples_per_second": 160.95,
"eval_steps_per_second": 5.031,
"step": 54000
},
{
"epoch": 5.6913116123642435,
"grad_norm": 2.6188199520111084,
"learning_rate": 4.55e-05,
"loss": 0.9233,
"step": 54500
},
{
"epoch": 5.6913116123642435,
"eval_accuracy": 0.8120400051070349,
"eval_loss": 0.9545953273773193,
"eval_runtime": 395.9182,
"eval_samples_per_second": 160.243,
"eval_steps_per_second": 5.009,
"step": 54500
},
{
"epoch": 5.743525480367586,
"grad_norm": 2.502518653869629,
"learning_rate": 4.5e-05,
"loss": 0.9272,
"step": 55000
},
{
"epoch": 5.743525480367586,
"eval_accuracy": 0.8122659898359591,
"eval_loss": 0.95594322681427,
"eval_runtime": 396.2822,
"eval_samples_per_second": 160.096,
"eval_steps_per_second": 5.004,
"step": 55000
},
{
"epoch": 5.795739348370927,
"grad_norm": 2.527303457260132,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.9163,
"step": 55500
},
{
"epoch": 5.795739348370927,
"eval_accuracy": 0.8124216614631093,
"eval_loss": 0.9586062431335449,
"eval_runtime": 402.0755,
"eval_samples_per_second": 157.789,
"eval_steps_per_second": 4.932,
"step": 55500
},
{
"epoch": 5.847953216374269,
"grad_norm": 2.5056955814361572,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.9249,
"step": 56000
},
{
"epoch": 5.847953216374269,
"eval_accuracy": 0.8123958167743631,
"eval_loss": 0.9501298069953918,
"eval_runtime": 397.3012,
"eval_samples_per_second": 159.685,
"eval_steps_per_second": 4.991,
"step": 56000
},
{
"epoch": 5.90016708437761,
"grad_norm": 2.876765727996826,
"learning_rate": 4.35e-05,
"loss": 0.9167,
"step": 56500
},
{
"epoch": 5.90016708437761,
"eval_accuracy": 0.8123634219518265,
"eval_loss": 0.958740770816803,
"eval_runtime": 393.8274,
"eval_samples_per_second": 161.093,
"eval_steps_per_second": 5.035,
"step": 56500
},
{
"epoch": 5.9523809523809526,
"grad_norm": 2.457308769226074,
"learning_rate": 4.3e-05,
"loss": 0.9191,
"step": 57000
},
{
"epoch": 5.9523809523809526,
"eval_accuracy": 0.8134356064347583,
"eval_loss": 0.9526042938232422,
"eval_runtime": 393.3254,
"eval_samples_per_second": 161.299,
"eval_steps_per_second": 5.042,
"step": 57000
},
{
"epoch": 6.004594820384294,
"grad_norm": 2.489978075027466,
"learning_rate": 4.25e-05,
"loss": 0.9176,
"step": 57500
},
{
"epoch": 6.004594820384294,
"eval_accuracy": 0.8137190312134606,
"eval_loss": 0.9511122703552246,
"eval_runtime": 398.7282,
"eval_samples_per_second": 159.113,
"eval_steps_per_second": 4.973,
"step": 57500
},
{
"epoch": 6.056808688387636,
"grad_norm": 2.7550694942474365,
"learning_rate": 4.2e-05,
"loss": 0.9034,
"step": 58000
},
{
"epoch": 6.056808688387636,
"eval_accuracy": 0.8137725525224172,
"eval_loss": 0.9519580006599426,
"eval_runtime": 397.3745,
"eval_samples_per_second": 159.655,
"eval_steps_per_second": 4.99,
"step": 58000
},
{
"epoch": 6.109022556390977,
"grad_norm": 2.820486068725586,
"learning_rate": 4.15e-05,
"loss": 0.905,
"step": 58500
},
{
"epoch": 6.109022556390977,
"eval_accuracy": 0.8139618722177003,
"eval_loss": 0.9513248801231384,
"eval_runtime": 401.2273,
"eval_samples_per_second": 158.122,
"eval_steps_per_second": 4.942,
"step": 58500
},
{
"epoch": 6.161236424394319,
"grad_norm": 2.634119749069214,
"learning_rate": 4.1e-05,
"loss": 0.902,
"step": 59000
},
{
"epoch": 6.161236424394319,
"eval_accuracy": 0.8135963134645056,
"eval_loss": 0.9465650916099548,
"eval_runtime": 402.0909,
"eval_samples_per_second": 157.783,
"eval_steps_per_second": 4.932,
"step": 59000
},
{
"epoch": 6.213450292397661,
"grad_norm": 2.8152124881744385,
"learning_rate": 4.05e-05,
"loss": 0.9016,
"step": 59500
},
{
"epoch": 6.213450292397661,
"eval_accuracy": 0.8144102857826985,
"eval_loss": 0.9431130886077881,
"eval_runtime": 399.633,
"eval_samples_per_second": 158.753,
"eval_steps_per_second": 4.962,
"step": 59500
},
{
"epoch": 6.265664160401003,
"grad_norm": 2.478999137878418,
"learning_rate": 4e-05,
"loss": 0.9018,
"step": 60000
},
{
"epoch": 6.265664160401003,
"eval_accuracy": 0.8148281916671716,
"eval_loss": 0.9465348124504089,
"eval_runtime": 401.047,
"eval_samples_per_second": 158.193,
"eval_steps_per_second": 4.945,
"step": 60000
},
{
"epoch": 6.317878028404344,
"grad_norm": 2.4472334384918213,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.8894,
"step": 60500
},
{
"epoch": 6.317878028404344,
"eval_accuracy": 0.8152884197022962,
"eval_loss": 0.9336789846420288,
"eval_runtime": 399.7265,
"eval_samples_per_second": 158.716,
"eval_steps_per_second": 4.961,
"step": 60500
},
{
"epoch": 6.370091896407686,
"grad_norm": 2.6032795906066895,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.8861,
"step": 61000
},
{
"epoch": 6.370091896407686,
"eval_accuracy": 0.8158878972595904,
"eval_loss": 0.9414376616477966,
"eval_runtime": 403.8156,
"eval_samples_per_second": 157.109,
"eval_steps_per_second": 4.911,
"step": 61000
},
{
"epoch": 6.4223057644110275,
"grad_norm": 2.5782527923583984,
"learning_rate": 3.85e-05,
"loss": 0.8929,
"step": 61500
},
{
"epoch": 6.4223057644110275,
"eval_accuracy": 0.8158353407843104,
"eval_loss": 0.9369956254959106,
"eval_runtime": 403.5486,
"eval_samples_per_second": 157.213,
"eval_steps_per_second": 4.914,
"step": 61500
},
{
"epoch": 6.474519632414369,
"grad_norm": 2.3801369667053223,
"learning_rate": 3.8e-05,
"loss": 0.8869,
"step": 62000
},
{
"epoch": 6.474519632414369,
"eval_accuracy": 0.8156502533666622,
"eval_loss": 0.9334683418273926,
"eval_runtime": 401.7901,
"eval_samples_per_second": 157.901,
"eval_steps_per_second": 4.935,
"step": 62000
},
{
"epoch": 6.526733500417711,
"grad_norm": 2.44163179397583,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.8933,
"step": 62500
},
{
"epoch": 6.526733500417711,
"eval_accuracy": 0.8168851117616432,
"eval_loss": 0.9306558966636658,
"eval_runtime": 397.9786,
"eval_samples_per_second": 159.413,
"eval_steps_per_second": 4.983,
"step": 62500
},
{
"epoch": 6.578947368421053,
"grad_norm": 2.2924203872680664,
"learning_rate": 3.7e-05,
"loss": 0.8862,
"step": 63000
},
{
"epoch": 6.578947368421053,
"eval_accuracy": 0.8162968497056878,
"eval_loss": 0.9353703856468201,
"eval_runtime": 394.7488,
"eval_samples_per_second": 160.717,
"eval_steps_per_second": 5.023,
"step": 63000
},
{
"epoch": 6.631161236424394,
"grad_norm": 2.552828550338745,
"learning_rate": 3.65e-05,
"loss": 0.8948,
"step": 63500
},
{
"epoch": 6.631161236424394,
"eval_accuracy": 0.8177748484660532,
"eval_loss": 0.9219902157783508,
"eval_runtime": 393.8065,
"eval_samples_per_second": 161.102,
"eval_steps_per_second": 5.035,
"step": 63500
},
{
"epoch": 6.683375104427736,
"grad_norm": 2.423050880432129,
"learning_rate": 3.6e-05,
"loss": 0.8889,
"step": 64000
},
{
"epoch": 6.683375104427736,
"eval_accuracy": 0.8173146779439671,
"eval_loss": 0.9342746138572693,
"eval_runtime": 393.0912,
"eval_samples_per_second": 161.395,
"eval_steps_per_second": 5.045,
"step": 64000
},
{
"epoch": 6.735588972431078,
"grad_norm": 2.796268939971924,
"learning_rate": 3.55e-05,
"loss": 0.8885,
"step": 64500
},
{
"epoch": 6.735588972431078,
"eval_accuracy": 0.817551689369001,
"eval_loss": 0.9292306303977966,
"eval_runtime": 394.1087,
"eval_samples_per_second": 160.978,
"eval_steps_per_second": 5.032,
"step": 64500
},
{
"epoch": 6.787802840434419,
"grad_norm": 2.8128762245178223,
"learning_rate": 3.5e-05,
"loss": 0.8854,
"step": 65000
},
{
"epoch": 6.787802840434419,
"eval_accuracy": 0.8176226440260885,
"eval_loss": 0.9265376925468445,
"eval_runtime": 394.2997,
"eval_samples_per_second": 160.9,
"eval_steps_per_second": 5.029,
"step": 65000
},
{
"epoch": 6.840016708437761,
"grad_norm": 2.8697590827941895,
"learning_rate": 3.45e-05,
"loss": 0.88,
"step": 65500
},
{
"epoch": 6.840016708437761,
"eval_accuracy": 0.8185052265994757,
"eval_loss": 0.928463876247406,
"eval_runtime": 394.323,
"eval_samples_per_second": 160.891,
"eval_steps_per_second": 5.029,
"step": 65500
},
{
"epoch": 6.8922305764411025,
"grad_norm": 2.6351752281188965,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.8841,
"step": 66000
},
{
"epoch": 6.8922305764411025,
"eval_accuracy": 0.8183555493481675,
"eval_loss": 0.9194802045822144,
"eval_runtime": 400.5152,
"eval_samples_per_second": 158.403,
"eval_steps_per_second": 4.951,
"step": 66000
},
{
"epoch": 6.944444444444445,
"grad_norm": 2.2477810382843018,
"learning_rate": 3.35e-05,
"loss": 0.8807,
"step": 66500
},
{
"epoch": 6.944444444444445,
"eval_accuracy": 0.8187747090050445,
"eval_loss": 0.9138378500938416,
"eval_runtime": 394.4731,
"eval_samples_per_second": 160.83,
"eval_steps_per_second": 5.027,
"step": 66500
},
{
"epoch": 6.996658312447786,
"grad_norm": 2.1748201847076416,
"learning_rate": 3.3e-05,
"loss": 0.8836,
"step": 67000
},
{
"epoch": 6.996658312447786,
"eval_accuracy": 0.8190003355856446,
"eval_loss": 0.9168543815612793,
"eval_runtime": 394.6502,
"eval_samples_per_second": 160.758,
"eval_steps_per_second": 5.025,
"step": 67000
},
{
"epoch": 7.048872180451128,
"grad_norm": 2.7435665130615234,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.8707,
"step": 67500
},
{
"epoch": 7.048872180451128,
"eval_accuracy": 0.8191917867246192,
"eval_loss": 0.9231117367744446,
"eval_runtime": 392.3745,
"eval_samples_per_second": 161.69,
"eval_steps_per_second": 5.054,
"step": 67500
},
{
"epoch": 7.101086048454469,
"grad_norm": 2.569931745529175,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.8701,
"step": 68000
},
{
"epoch": 7.101086048454469,
"eval_accuracy": 0.819969121038317,
"eval_loss": 0.9153112173080444,
"eval_runtime": 393.0875,
"eval_samples_per_second": 161.397,
"eval_steps_per_second": 5.045,
"step": 68000
},
{
"epoch": 7.1532999164578115,
"grad_norm": 2.819972515106201,
"learning_rate": 3.15e-05,
"loss": 0.8656,
"step": 68500
},
{
"epoch": 7.1532999164578115,
"eval_accuracy": 0.8194268014568229,
"eval_loss": 0.9146177768707275,
"eval_runtime": 394.2167,
"eval_samples_per_second": 160.934,
"eval_steps_per_second": 5.03,
"step": 68500
},
{
"epoch": 7.205513784461153,
"grad_norm": 2.415208578109741,
"learning_rate": 3.1e-05,
"loss": 0.871,
"step": 69000
},
{
"epoch": 7.205513784461153,
"eval_accuracy": 0.8198532803574815,
"eval_loss": NaN,
"eval_runtime": 395.7273,
"eval_samples_per_second": 160.32,
"eval_steps_per_second": 5.011,
"step": 69000
},
{
"epoch": 7.257727652464495,
"grad_norm": 2.7900257110595703,
"learning_rate": 3.05e-05,
"loss": 0.8685,
"step": 69500
},
{
"epoch": 7.257727652464495,
"eval_accuracy": 0.8207157152692325,
"eval_loss": 0.9127238392829895,
"eval_runtime": 392.7333,
"eval_samples_per_second": 161.542,
"eval_steps_per_second": 5.049,
"step": 69500
},
{
"epoch": 7.309941520467836,
"grad_norm": 2.643833637237549,
"learning_rate": 3e-05,
"loss": 0.8623,
"step": 70000
},
{
"epoch": 7.309941520467836,
"eval_accuracy": 0.8202351749773661,
"eval_loss": 0.9171856045722961,
"eval_runtime": 393.097,
"eval_samples_per_second": 161.393,
"eval_steps_per_second": 5.045,
"step": 70000
},
{
"epoch": 7.362155388471178,
"grad_norm": 2.5037946701049805,
"learning_rate": 2.95e-05,
"loss": 0.8601,
"step": 70500
},
{
"epoch": 7.362155388471178,
"eval_accuracy": 0.8211525544791177,
"eval_loss": 0.903904139995575,
"eval_runtime": 394.3466,
"eval_samples_per_second": 160.881,
"eval_steps_per_second": 5.029,
"step": 70500
},
{
"epoch": 7.41436925647452,
"grad_norm": 2.7093377113342285,
"learning_rate": 2.9e-05,
"loss": 0.8573,
"step": 71000
},
{
"epoch": 7.41436925647452,
"eval_accuracy": 0.8214304681574243,
"eval_loss": 0.9036524891853333,
"eval_runtime": 393.5778,
"eval_samples_per_second": 161.196,
"eval_steps_per_second": 5.038,
"step": 71000
},
{
"epoch": 7.466583124477861,
"grad_norm": 2.760674476623535,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.8633,
"step": 71500
},
{
"epoch": 7.466583124477861,
"eval_accuracy": 0.8217283854693812,
"eval_loss": 0.9000177979469299,
"eval_runtime": 396.5628,
"eval_samples_per_second": 159.982,
"eval_steps_per_second": 5.0,
"step": 71500
},
{
"epoch": 7.518796992481203,
"grad_norm": 2.831023693084717,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.8576,
"step": 72000
},
{
"epoch": 7.518796992481203,
"eval_accuracy": 0.8211579127032714,
"eval_loss": 0.918953001499176,
"eval_runtime": 400.5863,
"eval_samples_per_second": 158.375,
"eval_steps_per_second": 4.95,
"step": 72000
},
{
"epoch": 7.571010860484545,
"grad_norm": 2.453390598297119,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.854,
"step": 72500
},
{
"epoch": 7.571010860484545,
"eval_accuracy": 0.8220418032437955,
"eval_loss": 0.9041558504104614,
"eval_runtime": 399.249,
"eval_samples_per_second": 158.906,
"eval_steps_per_second": 4.967,
"step": 72500
},
{
"epoch": 7.6232247284878865,
"grad_norm": 2.6029062271118164,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.8615,
"step": 73000
},
{
"epoch": 7.6232247284878865,
"eval_accuracy": 0.8223261586980396,
"eval_loss": 0.8994259834289551,
"eval_runtime": 387.0421,
"eval_samples_per_second": 163.918,
"eval_steps_per_second": 5.123,
"step": 73000
},
{
"epoch": 7.675438596491228,
"grad_norm": 2.3841817378997803,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.8632,
"step": 73500
},
{
"epoch": 7.675438596491228,
"eval_accuracy": 0.8218723618637837,
"eval_loss": 0.9080346822738647,
"eval_runtime": 368.6926,
"eval_samples_per_second": 172.076,
"eval_steps_per_second": 5.378,
"step": 73500
},
{
"epoch": 7.72765246449457,
"grad_norm": 3.8187203407287598,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.8519,
"step": 74000
},
{
"epoch": 7.72765246449457,
"eval_accuracy": 0.8223824804514319,
"eval_loss": 0.9071189761161804,
"eval_runtime": 367.458,
"eval_samples_per_second": 172.654,
"eval_steps_per_second": 5.397,
"step": 74000
},
{
"epoch": 7.779866332497911,
"grad_norm": 2.744349241256714,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.8569,
"step": 74500
},
{
"epoch": 7.779866332497911,
"eval_accuracy": 0.8229439133116854,
"eval_loss": 0.9003260731697083,
"eval_runtime": 369.188,
"eval_samples_per_second": 171.845,
"eval_steps_per_second": 5.371,
"step": 74500
},
{
"epoch": 7.832080200501253,
"grad_norm": 2.3172342777252197,
"learning_rate": 2.5e-05,
"loss": 0.8502,
"step": 75000
},
{
"epoch": 7.832080200501253,
"eval_accuracy": 0.8229715307093431,
"eval_loss": 0.9045436382293701,
"eval_runtime": 368.1168,
"eval_samples_per_second": 172.345,
"eval_steps_per_second": 5.387,
"step": 75000
},
{
"epoch": 7.884294068504595,
"grad_norm": 2.6108663082122803,
"learning_rate": 2.45e-05,
"loss": 0.8536,
"step": 75500
},
{
"epoch": 7.884294068504595,
"eval_accuracy": 0.8234089591247552,
"eval_loss": 0.9026762247085571,
"eval_runtime": 366.8391,
"eval_samples_per_second": 172.945,
"eval_steps_per_second": 5.406,
"step": 75500
},
{
"epoch": 7.936507936507937,
"grad_norm": 2.315197467803955,
"learning_rate": 2.4e-05,
"loss": 0.85,
"step": 76000
},
{
"epoch": 7.936507936507937,
"eval_accuracy": 0.8232359994242249,
"eval_loss": 0.8995440006256104,
"eval_runtime": 367.8582,
"eval_samples_per_second": 172.466,
"eval_steps_per_second": 5.391,
"step": 76000
},
{
"epoch": 7.988721804511278,
"grad_norm": 2.611060380935669,
"learning_rate": 2.35e-05,
"loss": 0.8453,
"step": 76500
},
{
"epoch": 7.988721804511278,
"eval_accuracy": 0.8234605237364002,
"eval_loss": 0.8912914395332336,
"eval_runtime": 367.6079,
"eval_samples_per_second": 172.583,
"eval_steps_per_second": 5.394,
"step": 76500
},
{
"epoch": 8.04093567251462,
"grad_norm": 2.4672865867614746,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.8294,
"step": 77000
},
{
"epoch": 8.04093567251462,
"eval_accuracy": 0.8244379262655704,
"eval_loss": 0.896998405456543,
"eval_runtime": 367.8054,
"eval_samples_per_second": 172.491,
"eval_steps_per_second": 5.391,
"step": 77000
},
{
"epoch": 8.093149540517961,
"grad_norm": 2.645519495010376,
"learning_rate": 2.25e-05,
"loss": 0.849,
"step": 77500
},
{
"epoch": 8.093149540517961,
"eval_accuracy": 0.8244201499571945,
"eval_loss": 0.8957926034927368,
"eval_runtime": 369.9508,
"eval_samples_per_second": 171.49,
"eval_steps_per_second": 5.36,
"step": 77500
},
{
"epoch": 8.145363408521304,
"grad_norm": 2.7964978218078613,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.8363,
"step": 78000
},
{
"epoch": 8.145363408521304,
"eval_accuracy": 0.8244232177017738,
"eval_loss": 0.8884155750274658,
"eval_runtime": 367.358,
"eval_samples_per_second": 172.701,
"eval_steps_per_second": 5.398,
"step": 78000
},
{
"epoch": 8.197577276524646,
"grad_norm": 2.48044490814209,
"learning_rate": 2.15e-05,
"loss": 0.8355,
"step": 78500
},
{
"epoch": 8.197577276524646,
"eval_accuracy": 0.82532828384256,
"eval_loss": 0.8786827325820923,
"eval_runtime": 366.707,
"eval_samples_per_second": 173.007,
"eval_steps_per_second": 5.408,
"step": 78500
},
{
"epoch": 8.249791144527986,
"grad_norm": 2.6931161880493164,
"learning_rate": 2.1e-05,
"loss": 0.8439,
"step": 79000
},
{
"epoch": 8.249791144527986,
"eval_accuracy": 0.8246382720646043,
"eval_loss": 0.8848527073860168,
"eval_runtime": 366.9894,
"eval_samples_per_second": 172.874,
"eval_steps_per_second": 5.403,
"step": 79000
},
{
"epoch": 8.302005012531328,
"grad_norm": 2.607593297958374,
"learning_rate": 2.05e-05,
"loss": 0.8378,
"step": 79500
},
{
"epoch": 8.302005012531328,
"eval_accuracy": 0.8252490751457086,
"eval_loss": NaN,
"eval_runtime": 366.5162,
"eval_samples_per_second": 173.097,
"eval_steps_per_second": 5.41,
"step": 79500
},
{
"epoch": 8.35421888053467,
"grad_norm": 2.4364349842071533,
"learning_rate": 2e-05,
"loss": 0.8322,
"step": 80000
},
{
"epoch": 8.35421888053467,
"eval_accuracy": 0.8252812537780041,
"eval_loss": NaN,
"eval_runtime": 367.8653,
"eval_samples_per_second": 172.463,
"eval_steps_per_second": 5.391,
"step": 80000
},
{
"epoch": 8.406432748538013,
"grad_norm": 2.541379928588867,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.8366,
"step": 80500
},
{
"epoch": 8.406432748538013,
"eval_accuracy": 0.8252309925385912,
"eval_loss": 0.9011866450309753,
"eval_runtime": 367.843,
"eval_samples_per_second": 172.473,
"eval_steps_per_second": 5.391,
"step": 80500
},
{
"epoch": 8.458646616541353,
"grad_norm": 2.3451640605926514,
"learning_rate": 1.9e-05,
"loss": 0.8313,
"step": 81000
},
{
"epoch": 8.458646616541353,
"eval_accuracy": 0.8259800217048817,
"eval_loss": 0.8837263584136963,
"eval_runtime": 367.6187,
"eval_samples_per_second": 172.578,
"eval_steps_per_second": 5.394,
"step": 81000
},
{
"epoch": 8.510860484544695,
"grad_norm": 2.6211562156677246,
"learning_rate": 1.85e-05,
"loss": 0.8289,
"step": 81500
},
{
"epoch": 8.510860484544695,
"eval_accuracy": 0.8257132907861338,
"eval_loss": 0.8861507773399353,
"eval_runtime": 367.3072,
"eval_samples_per_second": 172.725,
"eval_steps_per_second": 5.399,
"step": 81500
},
{
"epoch": 8.563074352548037,
"grad_norm": 2.1193125247955322,
"learning_rate": 1.8e-05,
"loss": 0.8337,
"step": 82000
},
{
"epoch": 8.563074352548037,
"eval_accuracy": 0.8263971078151469,
"eval_loss": 0.8711854815483093,
"eval_runtime": 366.8982,
"eval_samples_per_second": 172.917,
"eval_steps_per_second": 5.405,
"step": 82000
},
{
"epoch": 8.615288220551378,
"grad_norm": 2.5682766437530518,
"learning_rate": 1.75e-05,
"loss": 0.8207,
"step": 82500
},
{
"epoch": 8.615288220551378,
"eval_accuracy": 0.8263459457288101,
"eval_loss": 0.8814241886138916,
"eval_runtime": 366.9346,
"eval_samples_per_second": 172.9,
"eval_steps_per_second": 5.404,
"step": 82500
},
{
"epoch": 8.66750208855472,
"grad_norm": 2.460284471511841,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.8273,
"step": 83000
},
{
"epoch": 8.66750208855472,
"eval_accuracy": 0.8265486834185438,
"eval_loss": 0.8726556301116943,
"eval_runtime": 367.479,
"eval_samples_per_second": 172.644,
"eval_steps_per_second": 5.396,
"step": 83000
},
{
"epoch": 8.719715956558062,
"grad_norm": 2.519878625869751,
"learning_rate": 1.65e-05,
"loss": 0.8212,
"step": 83500
},
{
"epoch": 8.719715956558062,
"eval_accuracy": 0.8265830712467154,
"eval_loss": 0.8803038001060486,
"eval_runtime": 368.0421,
"eval_samples_per_second": 172.38,
"eval_steps_per_second": 5.388,
"step": 83500
},
{
"epoch": 8.771929824561404,
"grad_norm": 2.9779090881347656,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.8267,
"step": 84000
},
{
"epoch": 8.771929824561404,
"eval_accuracy": 0.8266352426703358,
"eval_loss": NaN,
"eval_runtime": 370.1684,
"eval_samples_per_second": 171.39,
"eval_steps_per_second": 5.357,
"step": 84000
},
{
"epoch": 8.824143692564745,
"grad_norm": 2.5584726333618164,
"learning_rate": 1.55e-05,
"loss": 0.8236,
"step": 84500
},
{
"epoch": 8.824143692564745,
"eval_accuracy": 0.8272979799699072,
"eval_loss": 0.8709605932235718,
"eval_runtime": 367.2687,
"eval_samples_per_second": 172.743,
"eval_steps_per_second": 5.399,
"step": 84500
},
{
"epoch": 8.876357560568087,
"grad_norm": 2.8752248287200928,
"learning_rate": 1.5e-05,
"loss": 0.8284,
"step": 85000
},
{
"epoch": 8.876357560568087,
"eval_accuracy": 0.8270593176674245,
"eval_loss": 0.8703105449676514,
"eval_runtime": 367.121,
"eval_samples_per_second": 172.812,
"eval_steps_per_second": 5.401,
"step": 85000
},
{
"epoch": 8.928571428571429,
"grad_norm": 2.441423177719116,
"learning_rate": 1.45e-05,
"loss": 0.8305,
"step": 85500
},
{
"epoch": 8.928571428571429,
"eval_accuracy": 0.8273055974821587,
"eval_loss": 0.8782740831375122,
"eval_runtime": 367.2074,
"eval_samples_per_second": 172.772,
"eval_steps_per_second": 5.4,
"step": 85500
},
{
"epoch": 8.980785296574771,
"grad_norm": 2.5627357959747314,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.824,
"step": 86000
},
{
"epoch": 8.980785296574771,
"eval_accuracy": 0.8278787560948533,
"eval_loss": 0.8767423033714294,
"eval_runtime": 367.8162,
"eval_samples_per_second": 172.486,
"eval_steps_per_second": 5.391,
"step": 86000
},
{
"epoch": 9.032999164578111,
"grad_norm": 2.9659039974212646,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.8111,
"step": 86500
},
{
"epoch": 9.032999164578111,
"eval_accuracy": 0.8281429418603345,
"eval_loss": NaN,
"eval_runtime": 365.8491,
"eval_samples_per_second": 173.413,
"eval_steps_per_second": 5.42,
"step": 86500
},
{
"epoch": 9.085213032581454,
"grad_norm": 2.6677966117858887,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.8114,
"step": 87000
},
{
"epoch": 9.085213032581454,
"eval_accuracy": 0.8280896401382127,
"eval_loss": 0.8696718215942383,
"eval_runtime": 368.939,
"eval_samples_per_second": 171.961,
"eval_steps_per_second": 5.375,
"step": 87000
},
{
"epoch": 9.137426900584796,
"grad_norm": 2.8159141540527344,
"learning_rate": 1.25e-05,
"loss": 0.8214,
"step": 87500
},
{
"epoch": 9.137426900584796,
"eval_accuracy": 0.8287721660611849,
"eval_loss": 0.865568995475769,
"eval_runtime": 367.5282,
"eval_samples_per_second": 172.621,
"eval_steps_per_second": 5.396,
"step": 87500
},
{
"epoch": 9.189640768588138,
"grad_norm": 2.44439959526062,
"learning_rate": 1.2e-05,
"loss": 0.8117,
"step": 88000
},
{
"epoch": 9.189640768588138,
"eval_accuracy": 0.8286521507089889,
"eval_loss": 0.8658538460731506,
"eval_runtime": 365.9254,
"eval_samples_per_second": 173.377,
"eval_steps_per_second": 5.419,
"step": 88000
},
{
"epoch": 9.241854636591478,
"grad_norm": 2.686903953552246,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.8176,
"step": 88500
},
{
"epoch": 9.241854636591478,
"eval_accuracy": 0.8286974413686677,
"eval_loss": 0.8724528551101685,
"eval_runtime": 366.9175,
"eval_samples_per_second": 172.908,
"eval_steps_per_second": 5.404,
"step": 88500
},
{
"epoch": 9.29406850459482,
"grad_norm": 2.9780402183532715,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.8098,
"step": 89000
},
{
"epoch": 9.29406850459482,
"eval_accuracy": 0.8288113105821936,
"eval_loss": 0.8693479895591736,
"eval_runtime": 366.5706,
"eval_samples_per_second": 173.072,
"eval_steps_per_second": 5.41,
"step": 89000
},
{
"epoch": 9.346282372598163,
"grad_norm": 2.3616678714752197,
"learning_rate": 1.05e-05,
"loss": 0.8101,
"step": 89500
},
{
"epoch": 9.346282372598163,
"eval_accuracy": 0.8288534544077049,
"eval_loss": 0.8676702380180359,
"eval_runtime": 367.6956,
"eval_samples_per_second": 172.542,
"eval_steps_per_second": 5.393,
"step": 89500
},
{
"epoch": 9.398496240601503,
"grad_norm": 2.4607014656066895,
"learning_rate": 1e-05,
"loss": 0.8074,
"step": 90000
},
{
"epoch": 9.398496240601503,
"eval_accuracy": 0.8293368416792034,
"eval_loss": 0.868602454662323,
"eval_runtime": 370.3428,
"eval_samples_per_second": 171.309,
"eval_steps_per_second": 5.354,
"step": 90000
},
{
"epoch": 9.450710108604845,
"grad_norm": 2.303865432739258,
"learning_rate": 9.5e-06,
"loss": 0.8075,
"step": 90500
},
{
"epoch": 9.450710108604845,
"eval_accuracy": 0.829557819292624,
"eval_loss": 0.8760720491409302,
"eval_runtime": 366.9515,
"eval_samples_per_second": 172.892,
"eval_steps_per_second": 5.404,
"step": 90500
},
{
"epoch": 9.502923976608187,
"grad_norm": 3.1800107955932617,
"learning_rate": 9e-06,
"loss": 0.813,
"step": 91000
},
{
"epoch": 9.502923976608187,
"eval_accuracy": 0.8293719470428488,
"eval_loss": 0.8725053668022156,
"eval_runtime": 366.7268,
"eval_samples_per_second": 172.998,
"eval_steps_per_second": 5.407,
"step": 91000
},
{
"epoch": 9.55513784461153,
"grad_norm": 2.895458698272705,
"learning_rate": 8.500000000000002e-06,
"loss": 0.8142,
"step": 91500
},
{
"epoch": 9.55513784461153,
"eval_accuracy": 0.8299493767535929,
"eval_loss": 0.8597660660743713,
"eval_runtime": 366.9591,
"eval_samples_per_second": 172.888,
"eval_steps_per_second": 5.404,
"step": 91500
},
{
"epoch": 9.60735171261487,
"grad_norm": 2.494898796081543,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8028,
"step": 92000
},
{
"epoch": 9.60735171261487,
"eval_accuracy": 0.8296925250842329,
"eval_loss": 0.8663123250007629,
"eval_runtime": 366.9071,
"eval_samples_per_second": 172.913,
"eval_steps_per_second": 5.405,
"step": 92000
},
{
"epoch": 9.659565580618212,
"grad_norm": 2.6649532318115234,
"learning_rate": 7.5e-06,
"loss": 0.8051,
"step": 92500
},
{
"epoch": 9.659565580618212,
"eval_accuracy": 0.8297183234153672,
"eval_loss": 0.8609718084335327,
"eval_runtime": 369.3381,
"eval_samples_per_second": 171.775,
"eval_steps_per_second": 5.369,
"step": 92500
},
{
"epoch": 9.711779448621554,
"grad_norm": 2.708948850631714,
"learning_rate": 7.000000000000001e-06,
"loss": 0.7982,
"step": 93000
},
{
"epoch": 9.711779448621554,
"eval_accuracy": 0.8295886536989987,
"eval_loss": 0.8697434067726135,
"eval_runtime": 367.9632,
"eval_samples_per_second": 172.417,
"eval_steps_per_second": 5.389,
"step": 93000
},
{
"epoch": 9.763993316624896,
"grad_norm": 2.320215940475464,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.7985,
"step": 93500
},
{
"epoch": 9.763993316624896,
"eval_accuracy": 0.8301079266760504,
"eval_loss": 0.8655940890312195,
"eval_runtime": 366.2895,
"eval_samples_per_second": 173.205,
"eval_steps_per_second": 5.414,
"step": 93500
},
{
"epoch": 9.816207184628237,
"grad_norm": 2.776233673095703,
"learning_rate": 6e-06,
"loss": 0.8016,
"step": 94000
},
{
"epoch": 9.816207184628237,
"eval_accuracy": 0.8302324108540118,
"eval_loss": 0.8625151515007019,
"eval_runtime": 367.8541,
"eval_samples_per_second": 172.468,
"eval_steps_per_second": 5.391,
"step": 94000
},
{
"epoch": 9.868421052631579,
"grad_norm": 2.461749792098999,
"learning_rate": 5.500000000000001e-06,
"loss": 0.8044,
"step": 94500
},
{
"epoch": 9.868421052631579,
"eval_accuracy": 0.8303814560823992,
"eval_loss": NaN,
"eval_runtime": 366.9598,
"eval_samples_per_second": 172.888,
"eval_steps_per_second": 5.404,
"step": 94500
},
{
"epoch": 9.920634920634921,
"grad_norm": 2.611931085586548,
"learning_rate": 5e-06,
"loss": 0.8,
"step": 95000
},
{
"epoch": 9.920634920634921,
"eval_accuracy": 0.8306366115947627,
"eval_loss": 0.8623033165931702,
"eval_runtime": 368.1962,
"eval_samples_per_second": 172.308,
"eval_steps_per_second": 5.386,
"step": 95000
},
{
"epoch": 9.972848788638263,
"grad_norm": 2.500162363052368,
"learning_rate": 4.5e-06,
"loss": 0.7972,
"step": 95500
},
{
"epoch": 9.972848788638263,
"eval_accuracy": 0.8306005889087491,
"eval_loss": 0.8600591421127319,
"eval_runtime": 369.0733,
"eval_samples_per_second": 171.898,
"eval_steps_per_second": 5.373,
"step": 95500
},
{
"epoch": 10.025062656641603,
"grad_norm": 3.495044231414795,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7929,
"step": 96000
},
{
"epoch": 10.025062656641603,
"eval_accuracy": 0.8308683319034997,
"eval_loss": 0.8547109961509705,
"eval_runtime": 368.4222,
"eval_samples_per_second": 172.202,
"eval_steps_per_second": 5.382,
"step": 96000
},
{
"epoch": 10.077276524644946,
"grad_norm": 2.7070469856262207,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.7903,
"step": 96500
},
{
"epoch": 10.077276524644946,
"eval_accuracy": 0.8307715292279126,
"eval_loss": 0.8600655198097229,
"eval_runtime": 366.9772,
"eval_samples_per_second": 172.88,
"eval_steps_per_second": 5.404,
"step": 96500
},
{
"epoch": 10.129490392648288,
"grad_norm": 3.4624171257019043,
"learning_rate": 3e-06,
"loss": 0.7898,
"step": 97000
},
{
"epoch": 10.129490392648288,
"eval_accuracy": 0.831032096060647,
"eval_loss": 0.853934109210968,
"eval_runtime": 368.4043,
"eval_samples_per_second": 172.21,
"eval_steps_per_second": 5.383,
"step": 97000
},
{
"epoch": 10.18170426065163,
"grad_norm": 2.6027026176452637,
"learning_rate": 2.5e-06,
"loss": 0.7994,
"step": 97500
},
{
"epoch": 10.18170426065163,
"eval_accuracy": 0.8308584971832438,
"eval_loss": 0.8530024886131287,
"eval_runtime": 366.5565,
"eval_samples_per_second": 173.078,
"eval_steps_per_second": 5.41,
"step": 97500
},
{
"epoch": 10.23391812865497,
"grad_norm": 2.5377743244171143,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7926,
"step": 98000
},
{
"epoch": 10.23391812865497,
"eval_accuracy": 0.8310957564926591,
"eval_loss": 0.8588021993637085,
"eval_runtime": 366.0793,
"eval_samples_per_second": 173.304,
"eval_steps_per_second": 5.417,
"step": 98000
},
{
"epoch": 10.286131996658312,
"grad_norm": 2.9179139137268066,
"learning_rate": 1.5e-06,
"loss": 0.785,
"step": 98500
},
{
"epoch": 10.286131996658312,
"eval_accuracy": 0.8314594660073593,
"eval_loss": 0.8528650403022766,
"eval_runtime": 366.8402,
"eval_samples_per_second": 172.945,
"eval_steps_per_second": 5.406,
"step": 98500
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 11,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.304812876669911e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}