{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 10167, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014753614635585719, "grad_norm": 2.403060047303882, "learning_rate": 1.9665683382497543e-07, "loss": 1.0717, "num_tokens": 713056.0, "step": 5 }, { "epoch": 0.0029507229271171437, "grad_norm": 2.4004404184789543, "learning_rate": 4.424778761061947e-07, "loss": 1.0679, "num_tokens": 1489366.0, "step": 10 }, { "epoch": 0.004426084390675715, "grad_norm": 1.8229712325896212, "learning_rate": 6.882989183874141e-07, "loss": 1.0605, "num_tokens": 2321458.0, "step": 15 }, { "epoch": 0.005901445854234287, "grad_norm": 1.074101294268914, "learning_rate": 9.341199606686334e-07, "loss": 1.0478, "num_tokens": 3159759.0, "step": 20 }, { "epoch": 0.007376807317792859, "grad_norm": 1.0859473283690408, "learning_rate": 1.1799410029498526e-06, "loss": 1.0557, "num_tokens": 3928301.0, "step": 25 }, { "epoch": 0.00885216878135143, "grad_norm": 1.0915031319507078, "learning_rate": 1.4257620452310717e-06, "loss": 1.0077, "num_tokens": 4659355.0, "step": 30 }, { "epoch": 0.010327530244910003, "grad_norm": 0.8558544943583027, "learning_rate": 1.671583087512291e-06, "loss": 1.0158, "num_tokens": 5458828.0, "step": 35 }, { "epoch": 0.011802891708468575, "grad_norm": 0.6920693195159647, "learning_rate": 1.9174041297935103e-06, "loss": 0.9751, "num_tokens": 6204845.0, "step": 40 }, { "epoch": 0.013278253172027146, "grad_norm": 0.5476048570739727, "learning_rate": 2.1632251720747296e-06, "loss": 0.9483, "num_tokens": 6966149.0, "step": 45 }, { "epoch": 0.014753614635585718, "grad_norm": 0.5236812226373362, "learning_rate": 2.409046214355949e-06, "loss": 0.9494, "num_tokens": 7723950.0, "step": 50 }, { "epoch": 0.01622897609914429, "grad_norm": 0.4534221442911254, "learning_rate": 2.6548672566371683e-06, "loss": 0.8999, "num_tokens": 8387282.0, "step": 55 }, { "epoch": 0.01770433756270286, "grad_norm": 0.48135851841288757, "learning_rate": 2.9006882989183876e-06, "loss": 0.8976, "num_tokens": 9186724.0, "step": 60 }, { "epoch": 0.019179699026261435, "grad_norm": 0.4308986582797722, "learning_rate": 3.146509341199607e-06, "loss": 0.8845, "num_tokens": 9922464.0, "step": 65 }, { "epoch": 0.020655060489820007, "grad_norm": 0.37093240872206834, "learning_rate": 3.392330383480826e-06, "loss": 0.8542, "num_tokens": 10693481.0, "step": 70 }, { "epoch": 0.02213042195337858, "grad_norm": 0.461443833724279, "learning_rate": 3.638151425762045e-06, "loss": 0.8705, "num_tokens": 11451626.0, "step": 75 }, { "epoch": 0.02360578341693715, "grad_norm": 0.4002699606586522, "learning_rate": 3.883972468043265e-06, "loss": 0.8498, "num_tokens": 12245338.0, "step": 80 }, { "epoch": 0.02508114488049572, "grad_norm": 0.44093872129057715, "learning_rate": 4.129793510324484e-06, "loss": 0.8333, "num_tokens": 12968377.0, "step": 85 }, { "epoch": 0.026556506344054293, "grad_norm": 0.3729097926483993, "learning_rate": 4.375614552605703e-06, "loss": 0.844, "num_tokens": 13795568.0, "step": 90 }, { "epoch": 0.028031867807612864, "grad_norm": 0.376644464078817, "learning_rate": 4.621435594886923e-06, "loss": 0.8103, "num_tokens": 14535576.0, "step": 95 }, { "epoch": 0.029507229271171435, "grad_norm": 0.3952543675792905, "learning_rate": 4.867256637168142e-06, "loss": 0.8094, "num_tokens": 15256192.0, "step": 100 }, { "epoch": 0.03098259073473001, "grad_norm": 0.411779472812937, "learning_rate": 5.1130776794493605e-06, "loss": 0.8049, "num_tokens": 15955458.0, "step": 105 }, { "epoch": 0.03245795219828858, "grad_norm": 0.45828144858973324, "learning_rate": 5.358898721730581e-06, "loss": 0.7883, "num_tokens": 16725133.0, "step": 110 }, { "epoch": 0.03393331366184715, "grad_norm": 0.3902231855286409, "learning_rate": 5.6047197640118e-06, "loss": 0.8128, "num_tokens": 17490154.0, "step": 115 }, { "epoch": 0.03540867512540572, "grad_norm": 0.41181168011284996, "learning_rate": 5.8505408062930185e-06, "loss": 0.7935, "num_tokens": 18191127.0, "step": 120 }, { "epoch": 0.036884036588964296, "grad_norm": 0.3841164546071141, "learning_rate": 6.096361848574239e-06, "loss": 0.8041, "num_tokens": 18964748.0, "step": 125 }, { "epoch": 0.03835939805252287, "grad_norm": 0.3801972080126575, "learning_rate": 6.342182890855458e-06, "loss": 0.7852, "num_tokens": 19718493.0, "step": 130 }, { "epoch": 0.03983475951608144, "grad_norm": 0.43627923289735115, "learning_rate": 6.588003933136677e-06, "loss": 0.7714, "num_tokens": 20476564.0, "step": 135 }, { "epoch": 0.041310120979640014, "grad_norm": 0.3844706514188257, "learning_rate": 6.833824975417897e-06, "loss": 0.791, "num_tokens": 21239188.0, "step": 140 }, { "epoch": 0.04278548244319858, "grad_norm": 0.4240688845862792, "learning_rate": 7.079646017699115e-06, "loss": 0.7905, "num_tokens": 21968157.0, "step": 145 }, { "epoch": 0.04426084390675716, "grad_norm": 0.40009855480181267, "learning_rate": 7.325467059980334e-06, "loss": 0.7721, "num_tokens": 22729223.0, "step": 150 }, { "epoch": 0.045736205370315725, "grad_norm": 0.37878044793628346, "learning_rate": 7.571288102261554e-06, "loss": 0.7739, "num_tokens": 23510729.0, "step": 155 }, { "epoch": 0.0472115668338743, "grad_norm": 0.3815343898719683, "learning_rate": 7.817109144542774e-06, "loss": 0.7824, "num_tokens": 24297810.0, "step": 160 }, { "epoch": 0.048686928297432874, "grad_norm": 0.40760834126074463, "learning_rate": 8.062930186823993e-06, "loss": 0.7765, "num_tokens": 25030885.0, "step": 165 }, { "epoch": 0.05016228976099144, "grad_norm": 0.4238204731712286, "learning_rate": 8.308751229105212e-06, "loss": 0.76, "num_tokens": 25826875.0, "step": 170 }, { "epoch": 0.05163765122455002, "grad_norm": 0.40409693314815015, "learning_rate": 8.554572271386432e-06, "loss": 0.7747, "num_tokens": 26600967.0, "step": 175 }, { "epoch": 0.053113012688108585, "grad_norm": 0.4004161960327109, "learning_rate": 8.80039331366765e-06, "loss": 0.7625, "num_tokens": 27312339.0, "step": 180 }, { "epoch": 0.05458837415166716, "grad_norm": 0.4356576954078916, "learning_rate": 9.046214355948869e-06, "loss": 0.7606, "num_tokens": 28004069.0, "step": 185 }, { "epoch": 0.05606373561522573, "grad_norm": 0.4167221187740886, "learning_rate": 9.29203539823009e-06, "loss": 0.7635, "num_tokens": 28769019.0, "step": 190 }, { "epoch": 0.0575390970787843, "grad_norm": 0.37605416112550655, "learning_rate": 9.537856440511309e-06, "loss": 0.7454, "num_tokens": 29540607.0, "step": 195 }, { "epoch": 0.05901445854234287, "grad_norm": 0.3928154675215524, "learning_rate": 9.783677482792528e-06, "loss": 0.7515, "num_tokens": 30260453.0, "step": 200 }, { "epoch": 0.060489820005901446, "grad_norm": 0.3895142700317512, "learning_rate": 1.0029498525073746e-05, "loss": 0.7558, "num_tokens": 30971852.0, "step": 205 }, { "epoch": 0.06196518146946002, "grad_norm": 0.4237599274695581, "learning_rate": 1.0275319567354965e-05, "loss": 0.7408, "num_tokens": 31740278.0, "step": 210 }, { "epoch": 0.0634405429330186, "grad_norm": 0.3754884822274214, "learning_rate": 1.0521140609636185e-05, "loss": 0.7608, "num_tokens": 32517474.0, "step": 215 }, { "epoch": 0.06491590439657716, "grad_norm": 0.4017995472474209, "learning_rate": 1.0766961651917406e-05, "loss": 0.7612, "num_tokens": 33321921.0, "step": 220 }, { "epoch": 0.06639126586013573, "grad_norm": 0.4555768767363855, "learning_rate": 1.1012782694198625e-05, "loss": 0.7477, "num_tokens": 34072313.0, "step": 225 }, { "epoch": 0.0678666273236943, "grad_norm": 0.4135807308659429, "learning_rate": 1.1258603736479843e-05, "loss": 0.7424, "num_tokens": 34842857.0, "step": 230 }, { "epoch": 0.06934198878725288, "grad_norm": 0.40640059040634813, "learning_rate": 1.1504424778761062e-05, "loss": 0.7511, "num_tokens": 35590470.0, "step": 235 }, { "epoch": 0.07081735025081144, "grad_norm": 0.43786481749552625, "learning_rate": 1.1750245821042281e-05, "loss": 0.7301, "num_tokens": 36329551.0, "step": 240 }, { "epoch": 0.07229271171437002, "grad_norm": 0.38743175364146676, "learning_rate": 1.19960668633235e-05, "loss": 0.7481, "num_tokens": 37074291.0, "step": 245 }, { "epoch": 0.07376807317792859, "grad_norm": 0.37985913982890745, "learning_rate": 1.2241887905604722e-05, "loss": 0.7403, "num_tokens": 37856535.0, "step": 250 }, { "epoch": 0.07524343464148717, "grad_norm": 0.39214385833920623, "learning_rate": 1.2487708947885939e-05, "loss": 0.7295, "num_tokens": 38616757.0, "step": 255 }, { "epoch": 0.07671879610504574, "grad_norm": 0.42415112538584987, "learning_rate": 1.2733529990167158e-05, "loss": 0.755, "num_tokens": 39327360.0, "step": 260 }, { "epoch": 0.0781941575686043, "grad_norm": 0.4209387661346241, "learning_rate": 1.2979351032448378e-05, "loss": 0.729, "num_tokens": 40045952.0, "step": 265 }, { "epoch": 0.07966951903216288, "grad_norm": 0.49181873330033754, "learning_rate": 1.3225172074729597e-05, "loss": 0.7509, "num_tokens": 40859806.0, "step": 270 }, { "epoch": 0.08114488049572145, "grad_norm": 0.4628035081908318, "learning_rate": 1.3470993117010816e-05, "loss": 0.7192, "num_tokens": 41554556.0, "step": 275 }, { "epoch": 0.08262024195928003, "grad_norm": 0.38350172413015365, "learning_rate": 1.3716814159292036e-05, "loss": 0.7392, "num_tokens": 42277063.0, "step": 280 }, { "epoch": 0.0840956034228386, "grad_norm": 0.4132574447757612, "learning_rate": 1.3962635201573255e-05, "loss": 0.7252, "num_tokens": 43039640.0, "step": 285 }, { "epoch": 0.08557096488639716, "grad_norm": 0.45256212603231455, "learning_rate": 1.4208456243854473e-05, "loss": 0.7309, "num_tokens": 43818099.0, "step": 290 }, { "epoch": 0.08704632634995574, "grad_norm": 0.43501522916788044, "learning_rate": 1.4454277286135695e-05, "loss": 0.7386, "num_tokens": 44592317.0, "step": 295 }, { "epoch": 0.08852168781351431, "grad_norm": 0.4092727640383535, "learning_rate": 1.4700098328416915e-05, "loss": 0.7344, "num_tokens": 45362908.0, "step": 300 }, { "epoch": 0.08999704927707289, "grad_norm": 0.44153305416917676, "learning_rate": 1.4945919370698134e-05, "loss": 0.7471, "num_tokens": 46082574.0, "step": 305 }, { "epoch": 0.09147241074063145, "grad_norm": 0.4198024357239949, "learning_rate": 1.5191740412979352e-05, "loss": 0.7236, "num_tokens": 46860982.0, "step": 310 }, { "epoch": 0.09294777220419002, "grad_norm": 0.3724435315454752, "learning_rate": 1.543756145526057e-05, "loss": 0.7397, "num_tokens": 47669244.0, "step": 315 }, { "epoch": 0.0944231336677486, "grad_norm": 0.37467153374936435, "learning_rate": 1.568338249754179e-05, "loss": 0.728, "num_tokens": 48475836.0, "step": 320 }, { "epoch": 0.09589849513130717, "grad_norm": 0.41081789323709744, "learning_rate": 1.592920353982301e-05, "loss": 0.723, "num_tokens": 49257132.0, "step": 325 }, { "epoch": 0.09737385659486575, "grad_norm": 0.42022934946996715, "learning_rate": 1.617502458210423e-05, "loss": 0.7077, "num_tokens": 50006246.0, "step": 330 }, { "epoch": 0.09884921805842431, "grad_norm": 0.4364236984553688, "learning_rate": 1.6420845624385448e-05, "loss": 0.7308, "num_tokens": 50796262.0, "step": 335 }, { "epoch": 0.10032457952198288, "grad_norm": 0.4111324571881721, "learning_rate": 1.6666666666666667e-05, "loss": 0.7241, "num_tokens": 51536978.0, "step": 340 }, { "epoch": 0.10179994098554146, "grad_norm": 0.40545628162471986, "learning_rate": 1.6912487708947887e-05, "loss": 0.7291, "num_tokens": 52261392.0, "step": 345 }, { "epoch": 0.10327530244910003, "grad_norm": 0.42267964920957396, "learning_rate": 1.7158308751229106e-05, "loss": 0.7272, "num_tokens": 52994509.0, "step": 350 }, { "epoch": 0.1047506639126586, "grad_norm": 0.4448527147226747, "learning_rate": 1.7404129793510325e-05, "loss": 0.7247, "num_tokens": 53729232.0, "step": 355 }, { "epoch": 0.10622602537621717, "grad_norm": 0.4305015993528053, "learning_rate": 1.7649950835791545e-05, "loss": 0.7303, "num_tokens": 54458711.0, "step": 360 }, { "epoch": 0.10770138683977575, "grad_norm": 0.4439316067902174, "learning_rate": 1.7895771878072764e-05, "loss": 0.7036, "num_tokens": 55195563.0, "step": 365 }, { "epoch": 0.10917674830333432, "grad_norm": 0.4579551610050442, "learning_rate": 1.8141592920353983e-05, "loss": 0.6951, "num_tokens": 55871795.0, "step": 370 }, { "epoch": 0.1106521097668929, "grad_norm": 0.39493788459465773, "learning_rate": 1.8387413962635203e-05, "loss": 0.7283, "num_tokens": 56633773.0, "step": 375 }, { "epoch": 0.11212747123045146, "grad_norm": 0.46922754987576215, "learning_rate": 1.8633235004916422e-05, "loss": 0.7108, "num_tokens": 57367856.0, "step": 380 }, { "epoch": 0.11360283269401003, "grad_norm": 0.41224659317766926, "learning_rate": 1.887905604719764e-05, "loss": 0.6988, "num_tokens": 58068855.0, "step": 385 }, { "epoch": 0.1150781941575686, "grad_norm": 0.4201552104982245, "learning_rate": 1.912487708947886e-05, "loss": 0.7311, "num_tokens": 58878411.0, "step": 390 }, { "epoch": 0.11655355562112718, "grad_norm": 0.3825082232269288, "learning_rate": 1.937069813176008e-05, "loss": 0.7233, "num_tokens": 59715294.0, "step": 395 }, { "epoch": 0.11802891708468574, "grad_norm": 0.38051393310995324, "learning_rate": 1.96165191740413e-05, "loss": 0.715, "num_tokens": 60531520.0, "step": 400 }, { "epoch": 0.11950427854824432, "grad_norm": 0.4127948125363786, "learning_rate": 1.986234021632252e-05, "loss": 0.7158, "num_tokens": 61284624.0, "step": 405 }, { "epoch": 0.12097964001180289, "grad_norm": 0.4066782749481335, "learning_rate": 2.0108161258603738e-05, "loss": 0.7104, "num_tokens": 61976000.0, "step": 410 }, { "epoch": 0.12245500147536147, "grad_norm": 0.4210436094996318, "learning_rate": 2.0353982300884957e-05, "loss": 0.737, "num_tokens": 62777045.0, "step": 415 }, { "epoch": 0.12393036293892004, "grad_norm": 0.3898298505593025, "learning_rate": 2.0599803343166176e-05, "loss": 0.7279, "num_tokens": 63580273.0, "step": 420 }, { "epoch": 0.1254057244024786, "grad_norm": 0.40182139335087225, "learning_rate": 2.0845624385447396e-05, "loss": 0.7045, "num_tokens": 64326334.0, "step": 425 }, { "epoch": 0.1268810858660372, "grad_norm": 0.4078597305921958, "learning_rate": 2.1091445427728615e-05, "loss": 0.6845, "num_tokens": 65009540.0, "step": 430 }, { "epoch": 0.12835644732959575, "grad_norm": 0.38608961640134243, "learning_rate": 2.1337266470009834e-05, "loss": 0.714, "num_tokens": 65835091.0, "step": 435 }, { "epoch": 0.1298318087931543, "grad_norm": 0.3784195114904803, "learning_rate": 2.1583087512291054e-05, "loss": 0.6929, "num_tokens": 66594502.0, "step": 440 }, { "epoch": 0.1313071702567129, "grad_norm": 0.43060968261999943, "learning_rate": 2.1828908554572273e-05, "loss": 0.7125, "num_tokens": 67362572.0, "step": 445 }, { "epoch": 0.13278253172027146, "grad_norm": 0.3841365592198629, "learning_rate": 2.2074729596853492e-05, "loss": 0.7038, "num_tokens": 68122824.0, "step": 450 }, { "epoch": 0.13425789318383005, "grad_norm": 0.43753993060419816, "learning_rate": 2.232055063913471e-05, "loss": 0.7234, "num_tokens": 68858267.0, "step": 455 }, { "epoch": 0.1357332546473886, "grad_norm": 0.45267575986393793, "learning_rate": 2.2566371681415928e-05, "loss": 0.7103, "num_tokens": 69647103.0, "step": 460 }, { "epoch": 0.13720861611094717, "grad_norm": 0.4378093902225933, "learning_rate": 2.2812192723697147e-05, "loss": 0.7002, "num_tokens": 70373095.0, "step": 465 }, { "epoch": 0.13868397757450576, "grad_norm": 0.4244533874817389, "learning_rate": 2.305801376597837e-05, "loss": 0.7233, "num_tokens": 71132929.0, "step": 470 }, { "epoch": 0.14015933903806432, "grad_norm": 0.4077836133951298, "learning_rate": 2.330383480825959e-05, "loss": 0.6966, "num_tokens": 71864623.0, "step": 475 }, { "epoch": 0.14163470050162288, "grad_norm": 0.42112537319850096, "learning_rate": 2.3549655850540808e-05, "loss": 0.6981, "num_tokens": 72582832.0, "step": 480 }, { "epoch": 0.14311006196518147, "grad_norm": 0.41152255844132724, "learning_rate": 2.3795476892822028e-05, "loss": 0.7108, "num_tokens": 73399108.0, "step": 485 }, { "epoch": 0.14458542342874003, "grad_norm": 0.4032468707687444, "learning_rate": 2.4041297935103247e-05, "loss": 0.7237, "num_tokens": 74154013.0, "step": 490 }, { "epoch": 0.14606078489229862, "grad_norm": 0.40490808398276473, "learning_rate": 2.4287118977384466e-05, "loss": 0.7188, "num_tokens": 74956274.0, "step": 495 }, { "epoch": 0.14753614635585718, "grad_norm": 0.3757256597630011, "learning_rate": 2.4532940019665686e-05, "loss": 0.7226, "num_tokens": 75800308.0, "step": 500 }, { "epoch": 0.14901150781941574, "grad_norm": 0.41472699471778185, "learning_rate": 2.4778761061946905e-05, "loss": 0.6947, "num_tokens": 76524761.0, "step": 505 }, { "epoch": 0.15048686928297433, "grad_norm": 0.40486142552782267, "learning_rate": 2.5024582104228124e-05, "loss": 0.7071, "num_tokens": 77282077.0, "step": 510 }, { "epoch": 0.1519622307465329, "grad_norm": 0.3904643227172614, "learning_rate": 2.527040314650934e-05, "loss": 0.6965, "num_tokens": 78052232.0, "step": 515 }, { "epoch": 0.15343759221009148, "grad_norm": 0.4475383935213542, "learning_rate": 2.551622418879056e-05, "loss": 0.7037, "num_tokens": 78755032.0, "step": 520 }, { "epoch": 0.15491295367365004, "grad_norm": 0.391686447527812, "learning_rate": 2.576204523107178e-05, "loss": 0.6971, "num_tokens": 79526071.0, "step": 525 }, { "epoch": 0.1563883151372086, "grad_norm": 0.42264578877057274, "learning_rate": 2.6007866273352998e-05, "loss": 0.7031, "num_tokens": 80298945.0, "step": 530 }, { "epoch": 0.1578636766007672, "grad_norm": 0.421589262048939, "learning_rate": 2.6253687315634217e-05, "loss": 0.7102, "num_tokens": 81098983.0, "step": 535 }, { "epoch": 0.15933903806432576, "grad_norm": 0.4803024094191547, "learning_rate": 2.6499508357915437e-05, "loss": 0.7075, "num_tokens": 81870547.0, "step": 540 }, { "epoch": 0.16081439952788434, "grad_norm": 0.40334287275173236, "learning_rate": 2.6745329400196656e-05, "loss": 0.7047, "num_tokens": 82615746.0, "step": 545 }, { "epoch": 0.1622897609914429, "grad_norm": 0.40767779550044786, "learning_rate": 2.6991150442477875e-05, "loss": 0.6801, "num_tokens": 83322826.0, "step": 550 }, { "epoch": 0.16376512245500147, "grad_norm": 0.40106789232481677, "learning_rate": 2.7236971484759095e-05, "loss": 0.6921, "num_tokens": 84103567.0, "step": 555 }, { "epoch": 0.16524048391856005, "grad_norm": 0.39517290365324104, "learning_rate": 2.7482792527040317e-05, "loss": 0.7122, "num_tokens": 84908610.0, "step": 560 }, { "epoch": 0.16671584538211862, "grad_norm": 0.4632790433901448, "learning_rate": 2.7728613569321537e-05, "loss": 0.6999, "num_tokens": 85633905.0, "step": 565 }, { "epoch": 0.1681912068456772, "grad_norm": 0.3916513935901957, "learning_rate": 2.7974434611602756e-05, "loss": 0.707, "num_tokens": 86458546.0, "step": 570 }, { "epoch": 0.16966656830923577, "grad_norm": 0.3846723105743189, "learning_rate": 2.8220255653883975e-05, "loss": 0.6869, "num_tokens": 87245841.0, "step": 575 }, { "epoch": 0.17114192977279433, "grad_norm": 0.37389651241107263, "learning_rate": 2.8466076696165195e-05, "loss": 0.702, "num_tokens": 88017001.0, "step": 580 }, { "epoch": 0.17261729123635292, "grad_norm": 0.37360313527612743, "learning_rate": 2.8711897738446414e-05, "loss": 0.6822, "num_tokens": 88787916.0, "step": 585 }, { "epoch": 0.17409265269991148, "grad_norm": 0.40485064474622706, "learning_rate": 2.8957718780727633e-05, "loss": 0.6774, "num_tokens": 89552545.0, "step": 590 }, { "epoch": 0.17556801416347004, "grad_norm": 0.428580086443521, "learning_rate": 2.9203539823008852e-05, "loss": 0.7127, "num_tokens": 90353898.0, "step": 595 }, { "epoch": 0.17704337562702863, "grad_norm": 0.3540271726377656, "learning_rate": 2.9449360865290072e-05, "loss": 0.6967, "num_tokens": 91122358.0, "step": 600 }, { "epoch": 0.1785187370905872, "grad_norm": 0.3690390695674548, "learning_rate": 2.969518190757129e-05, "loss": 0.7207, "num_tokens": 91924555.0, "step": 605 }, { "epoch": 0.17999409855414578, "grad_norm": 0.3691021233405591, "learning_rate": 2.994100294985251e-05, "loss": 0.6797, "num_tokens": 92699419.0, "step": 610 }, { "epoch": 0.18146946001770434, "grad_norm": 0.37647170263440116, "learning_rate": 3.018682399213373e-05, "loss": 0.6979, "num_tokens": 93480688.0, "step": 615 }, { "epoch": 0.1829448214812629, "grad_norm": 0.3943391732403247, "learning_rate": 3.0432645034414946e-05, "loss": 0.6844, "num_tokens": 94239820.0, "step": 620 }, { "epoch": 0.1844201829448215, "grad_norm": 0.4104799483768576, "learning_rate": 3.0678466076696165e-05, "loss": 0.6717, "num_tokens": 94952891.0, "step": 625 }, { "epoch": 0.18589554440838005, "grad_norm": 0.3762682552746218, "learning_rate": 3.092428711897739e-05, "loss": 0.6924, "num_tokens": 95726090.0, "step": 630 }, { "epoch": 0.18737090587193864, "grad_norm": 0.3942303704776413, "learning_rate": 3.1170108161258604e-05, "loss": 0.6899, "num_tokens": 96521356.0, "step": 635 }, { "epoch": 0.1888462673354972, "grad_norm": 0.39898691038597256, "learning_rate": 3.1415929203539826e-05, "loss": 0.6898, "num_tokens": 97234443.0, "step": 640 }, { "epoch": 0.19032162879905576, "grad_norm": 0.4158342552687878, "learning_rate": 3.166175024582104e-05, "loss": 0.7147, "num_tokens": 98013847.0, "step": 645 }, { "epoch": 0.19179699026261435, "grad_norm": 0.3972747794117785, "learning_rate": 3.190757128810226e-05, "loss": 0.687, "num_tokens": 98776398.0, "step": 650 }, { "epoch": 0.1932723517261729, "grad_norm": 0.3751203989747597, "learning_rate": 3.215339233038348e-05, "loss": 0.6796, "num_tokens": 99583235.0, "step": 655 }, { "epoch": 0.1947477131897315, "grad_norm": 0.42070712610539446, "learning_rate": 3.23992133726647e-05, "loss": 0.6801, "num_tokens": 100303941.0, "step": 660 }, { "epoch": 0.19622307465329006, "grad_norm": 0.4250386425485876, "learning_rate": 3.264503441494592e-05, "loss": 0.689, "num_tokens": 101026411.0, "step": 665 }, { "epoch": 0.19769843611684862, "grad_norm": 0.3611304164299744, "learning_rate": 3.2890855457227135e-05, "loss": 0.7109, "num_tokens": 101863256.0, "step": 670 }, { "epoch": 0.1991737975804072, "grad_norm": 0.3979101976773354, "learning_rate": 3.313667649950836e-05, "loss": 0.6752, "num_tokens": 102634376.0, "step": 675 }, { "epoch": 0.20064915904396577, "grad_norm": 0.3913017710888343, "learning_rate": 3.338249754178958e-05, "loss": 0.6856, "num_tokens": 103397283.0, "step": 680 }, { "epoch": 0.20212452050752433, "grad_norm": 0.3791639891347238, "learning_rate": 3.3628318584070804e-05, "loss": 0.6921, "num_tokens": 104202732.0, "step": 685 }, { "epoch": 0.20359988197108292, "grad_norm": 0.38684255895503106, "learning_rate": 3.387413962635202e-05, "loss": 0.6911, "num_tokens": 104995072.0, "step": 690 }, { "epoch": 0.20507524343464148, "grad_norm": 0.3797812167834148, "learning_rate": 3.4119960668633235e-05, "loss": 0.6713, "num_tokens": 105719376.0, "step": 695 }, { "epoch": 0.20655060489820007, "grad_norm": 0.3756461534423814, "learning_rate": 3.436578171091446e-05, "loss": 0.6817, "num_tokens": 106498400.0, "step": 700 }, { "epoch": 0.20802596636175863, "grad_norm": 0.39415957308711, "learning_rate": 3.4611602753195674e-05, "loss": 0.6999, "num_tokens": 107248908.0, "step": 705 }, { "epoch": 0.2095013278253172, "grad_norm": 0.3493996962660629, "learning_rate": 3.48574237954769e-05, "loss": 0.6664, "num_tokens": 107987635.0, "step": 710 }, { "epoch": 0.21097668928887578, "grad_norm": 0.439642314751113, "learning_rate": 3.510324483775811e-05, "loss": 0.6769, "num_tokens": 108722380.0, "step": 715 }, { "epoch": 0.21245205075243434, "grad_norm": 0.3829187704457303, "learning_rate": 3.5349065880039335e-05, "loss": 0.6833, "num_tokens": 109482031.0, "step": 720 }, { "epoch": 0.21392741221599293, "grad_norm": 0.35321728011750486, "learning_rate": 3.559488692232055e-05, "loss": 0.69, "num_tokens": 110247934.0, "step": 725 }, { "epoch": 0.2154027736795515, "grad_norm": 0.3833498608624317, "learning_rate": 3.5840707964601774e-05, "loss": 0.6915, "num_tokens": 111020872.0, "step": 730 }, { "epoch": 0.21687813514311005, "grad_norm": 0.3894250927584196, "learning_rate": 3.608652900688299e-05, "loss": 0.7007, "num_tokens": 111744189.0, "step": 735 }, { "epoch": 0.21835349660666864, "grad_norm": 0.3487189748915444, "learning_rate": 3.633235004916421e-05, "loss": 0.6859, "num_tokens": 112543991.0, "step": 740 }, { "epoch": 0.2198288580702272, "grad_norm": 0.34476452016457987, "learning_rate": 3.657817109144543e-05, "loss": 0.6957, "num_tokens": 113307710.0, "step": 745 }, { "epoch": 0.2213042195337858, "grad_norm": 0.3737716846314369, "learning_rate": 3.682399213372665e-05, "loss": 0.6701, "num_tokens": 114049429.0, "step": 750 }, { "epoch": 0.22277958099734435, "grad_norm": 0.3858650845938994, "learning_rate": 3.706981317600787e-05, "loss": 0.6731, "num_tokens": 114801584.0, "step": 755 }, { "epoch": 0.2242549424609029, "grad_norm": 0.3901453727294595, "learning_rate": 3.731563421828908e-05, "loss": 0.6694, "num_tokens": 115515979.0, "step": 760 }, { "epoch": 0.2257303039244615, "grad_norm": 0.48151160306320545, "learning_rate": 3.7561455260570306e-05, "loss": 0.6599, "num_tokens": 116254401.0, "step": 765 }, { "epoch": 0.22720566538802006, "grad_norm": 0.3682023392339237, "learning_rate": 3.780727630285152e-05, "loss": 0.678, "num_tokens": 117002813.0, "step": 770 }, { "epoch": 0.22868102685157865, "grad_norm": 0.35372496907907897, "learning_rate": 3.8053097345132744e-05, "loss": 0.6681, "num_tokens": 117714749.0, "step": 775 }, { "epoch": 0.2301563883151372, "grad_norm": 0.42957863599813334, "learning_rate": 3.829891838741396e-05, "loss": 0.6809, "num_tokens": 118458530.0, "step": 780 }, { "epoch": 0.23163174977869577, "grad_norm": 0.3790637912771868, "learning_rate": 3.854473942969518e-05, "loss": 0.7043, "num_tokens": 119256386.0, "step": 785 }, { "epoch": 0.23310711124225436, "grad_norm": 0.36100802375330554, "learning_rate": 3.87905604719764e-05, "loss": 0.689, "num_tokens": 120016494.0, "step": 790 }, { "epoch": 0.23458247270581292, "grad_norm": 0.37991396355406726, "learning_rate": 3.903638151425762e-05, "loss": 0.688, "num_tokens": 120785476.0, "step": 795 }, { "epoch": 0.23605783416937148, "grad_norm": 0.35999377574829533, "learning_rate": 3.9282202556538844e-05, "loss": 0.6849, "num_tokens": 121543912.0, "step": 800 }, { "epoch": 0.23753319563293007, "grad_norm": 0.37242577746295585, "learning_rate": 3.952802359882006e-05, "loss": 0.6768, "num_tokens": 122338940.0, "step": 805 }, { "epoch": 0.23900855709648863, "grad_norm": 0.40244109271344947, "learning_rate": 3.977384464110128e-05, "loss": 0.6846, "num_tokens": 123073037.0, "step": 810 }, { "epoch": 0.24048391856004722, "grad_norm": 0.3427701452587553, "learning_rate": 4.00196656833825e-05, "loss": 0.6919, "num_tokens": 123889735.0, "step": 815 }, { "epoch": 0.24195928002360578, "grad_norm": 0.3533102867054504, "learning_rate": 4.026548672566372e-05, "loss": 0.6679, "num_tokens": 124661247.0, "step": 820 }, { "epoch": 0.24343464148716434, "grad_norm": 0.32419794014807934, "learning_rate": 4.051130776794494e-05, "loss": 0.68, "num_tokens": 125441452.0, "step": 825 }, { "epoch": 0.24491000295072293, "grad_norm": 0.4143694736008246, "learning_rate": 4.075712881022616e-05, "loss": 0.6668, "num_tokens": 126218495.0, "step": 830 }, { "epoch": 0.2463853644142815, "grad_norm": 0.36336193689168494, "learning_rate": 4.1002949852507376e-05, "loss": 0.6793, "num_tokens": 126935935.0, "step": 835 }, { "epoch": 0.24786072587784008, "grad_norm": 0.35876479780822995, "learning_rate": 4.12487708947886e-05, "loss": 0.6832, "num_tokens": 127697015.0, "step": 840 }, { "epoch": 0.24933608734139864, "grad_norm": 0.39920958683419044, "learning_rate": 4.1494591937069815e-05, "loss": 0.6585, "num_tokens": 128415460.0, "step": 845 }, { "epoch": 0.2508114488049572, "grad_norm": 0.49720870761086855, "learning_rate": 4.174041297935104e-05, "loss": 0.6662, "num_tokens": 129132538.0, "step": 850 }, { "epoch": 0.2522868102685158, "grad_norm": 0.37044021202524946, "learning_rate": 4.1986234021632253e-05, "loss": 0.6734, "num_tokens": 129866195.0, "step": 855 }, { "epoch": 0.2537621717320744, "grad_norm": 0.36411111983072514, "learning_rate": 4.223205506391347e-05, "loss": 0.667, "num_tokens": 130627904.0, "step": 860 }, { "epoch": 0.2552375331956329, "grad_norm": 0.3383293872778588, "learning_rate": 4.247787610619469e-05, "loss": 0.6674, "num_tokens": 131403897.0, "step": 865 }, { "epoch": 0.2567128946591915, "grad_norm": 0.3494701020308515, "learning_rate": 4.272369714847591e-05, "loss": 0.6693, "num_tokens": 132148103.0, "step": 870 }, { "epoch": 0.2581882561227501, "grad_norm": 0.3784958879688386, "learning_rate": 4.296951819075713e-05, "loss": 0.6868, "num_tokens": 132942670.0, "step": 875 }, { "epoch": 0.2596636175863086, "grad_norm": 0.36802437553639705, "learning_rate": 4.321533923303835e-05, "loss": 0.6677, "num_tokens": 133730152.0, "step": 880 }, { "epoch": 0.2611389790498672, "grad_norm": 0.36264511033815705, "learning_rate": 4.346116027531957e-05, "loss": 0.6747, "num_tokens": 134479054.0, "step": 885 }, { "epoch": 0.2626143405134258, "grad_norm": 0.3522347069193622, "learning_rate": 4.3706981317600785e-05, "loss": 0.6732, "num_tokens": 135210573.0, "step": 890 }, { "epoch": 0.26408970197698434, "grad_norm": 0.35989270050160205, "learning_rate": 4.395280235988201e-05, "loss": 0.6819, "num_tokens": 135972041.0, "step": 895 }, { "epoch": 0.2655650634405429, "grad_norm": 0.36204443427970534, "learning_rate": 4.4198623402163224e-05, "loss": 0.6645, "num_tokens": 136756851.0, "step": 900 }, { "epoch": 0.2670404249041015, "grad_norm": 0.3556891000563905, "learning_rate": 4.4444444444444447e-05, "loss": 0.659, "num_tokens": 137522921.0, "step": 905 }, { "epoch": 0.2685157863676601, "grad_norm": 0.3635236742102664, "learning_rate": 4.469026548672566e-05, "loss": 0.6895, "num_tokens": 138279844.0, "step": 910 }, { "epoch": 0.26999114783121864, "grad_norm": 0.38620420776062314, "learning_rate": 4.4936086529006885e-05, "loss": 0.664, "num_tokens": 139057714.0, "step": 915 }, { "epoch": 0.2714665092947772, "grad_norm": 0.36612870912165896, "learning_rate": 4.518190757128811e-05, "loss": 0.6831, "num_tokens": 139873790.0, "step": 920 }, { "epoch": 0.2729418707583358, "grad_norm": 0.31214237994162564, "learning_rate": 4.5427728613569324e-05, "loss": 0.6786, "num_tokens": 140646057.0, "step": 925 }, { "epoch": 0.27441723222189435, "grad_norm": 0.3140422893806558, "learning_rate": 4.5673549655850547e-05, "loss": 0.667, "num_tokens": 141388664.0, "step": 930 }, { "epoch": 0.27589259368545294, "grad_norm": 0.3669525525375058, "learning_rate": 4.591937069813176e-05, "loss": 0.6571, "num_tokens": 142115197.0, "step": 935 }, { "epoch": 0.2773679551490115, "grad_norm": 0.3704484853640469, "learning_rate": 4.6165191740412985e-05, "loss": 0.6745, "num_tokens": 142931999.0, "step": 940 }, { "epoch": 0.27884331661257006, "grad_norm": 0.3634672451743246, "learning_rate": 4.64110127826942e-05, "loss": 0.652, "num_tokens": 143747221.0, "step": 945 }, { "epoch": 0.28031867807612865, "grad_norm": 0.37873401147371566, "learning_rate": 4.6656833824975424e-05, "loss": 0.6569, "num_tokens": 144522728.0, "step": 950 }, { "epoch": 0.28179403953968724, "grad_norm": 0.35168853704448144, "learning_rate": 4.690265486725664e-05, "loss": 0.6768, "num_tokens": 145308349.0, "step": 955 }, { "epoch": 0.28326940100324577, "grad_norm": 0.33997202951960315, "learning_rate": 4.714847590953786e-05, "loss": 0.6497, "num_tokens": 146024776.0, "step": 960 }, { "epoch": 0.28474476246680436, "grad_norm": 0.3392663310576652, "learning_rate": 4.739429695181908e-05, "loss": 0.6639, "num_tokens": 146813996.0, "step": 965 }, { "epoch": 0.28622012393036295, "grad_norm": 0.35100784969820337, "learning_rate": 4.7640117994100294e-05, "loss": 0.6707, "num_tokens": 147556524.0, "step": 970 }, { "epoch": 0.28769548539392154, "grad_norm": 0.3503456929965031, "learning_rate": 4.788593903638152e-05, "loss": 0.67, "num_tokens": 148308320.0, "step": 975 }, { "epoch": 0.28917084685748007, "grad_norm": 0.35707493305134397, "learning_rate": 4.813176007866273e-05, "loss": 0.677, "num_tokens": 149073189.0, "step": 980 }, { "epoch": 0.29064620832103866, "grad_norm": 0.3322189629134223, "learning_rate": 4.8377581120943956e-05, "loss": 0.6572, "num_tokens": 149766691.0, "step": 985 }, { "epoch": 0.29212156978459725, "grad_norm": 0.4011866463647839, "learning_rate": 4.862340216322517e-05, "loss": 0.6679, "num_tokens": 150499445.0, "step": 990 }, { "epoch": 0.2935969312481558, "grad_norm": 0.45864557689947566, "learning_rate": 4.8869223205506394e-05, "loss": 0.6661, "num_tokens": 151295891.0, "step": 995 }, { "epoch": 0.29507229271171437, "grad_norm": 0.3569010083422384, "learning_rate": 4.911504424778761e-05, "loss": 0.6716, "num_tokens": 152094642.0, "step": 1000 }, { "epoch": 0.29654765417527296, "grad_norm": 0.3857213025510855, "learning_rate": 4.936086529006883e-05, "loss": 0.6743, "num_tokens": 152821318.0, "step": 1005 }, { "epoch": 0.2980230156388315, "grad_norm": 0.36506185553448, "learning_rate": 4.960668633235005e-05, "loss": 0.6618, "num_tokens": 153569409.0, "step": 1010 }, { "epoch": 0.2994983771023901, "grad_norm": 0.42325902789011266, "learning_rate": 4.985250737463127e-05, "loss": 0.6835, "num_tokens": 154294985.0, "step": 1015 }, { "epoch": 0.30097373856594867, "grad_norm": 0.35305077708797855, "learning_rate": 4.998907103825137e-05, "loss": 0.6681, "num_tokens": 155101893.0, "step": 1020 }, { "epoch": 0.30244910002950726, "grad_norm": 0.35169801485045576, "learning_rate": 4.996174863387979e-05, "loss": 0.6687, "num_tokens": 155828239.0, "step": 1025 }, { "epoch": 0.3039244614930658, "grad_norm": 0.36734853527932554, "learning_rate": 4.99344262295082e-05, "loss": 0.6608, "num_tokens": 156589525.0, "step": 1030 }, { "epoch": 0.3053998229566244, "grad_norm": 0.3272631411520736, "learning_rate": 4.9907103825136614e-05, "loss": 0.6758, "num_tokens": 157404338.0, "step": 1035 }, { "epoch": 0.30687518442018297, "grad_norm": 0.3347305335100958, "learning_rate": 4.987978142076503e-05, "loss": 0.6638, "num_tokens": 158185940.0, "step": 1040 }, { "epoch": 0.3083505458837415, "grad_norm": 0.33977626118652765, "learning_rate": 4.985245901639344e-05, "loss": 0.6502, "num_tokens": 158960268.0, "step": 1045 }, { "epoch": 0.3098259073473001, "grad_norm": 0.3450174766633891, "learning_rate": 4.9825136612021864e-05, "loss": 0.6789, "num_tokens": 159770525.0, "step": 1050 }, { "epoch": 0.3113012688108587, "grad_norm": 0.3988220430387113, "learning_rate": 4.9797814207650274e-05, "loss": 0.6552, "num_tokens": 160451207.0, "step": 1055 }, { "epoch": 0.3127766302744172, "grad_norm": 0.40365864971915916, "learning_rate": 4.977049180327869e-05, "loss": 0.6361, "num_tokens": 161138191.0, "step": 1060 }, { "epoch": 0.3142519917379758, "grad_norm": 0.34955429276044014, "learning_rate": 4.974316939890711e-05, "loss": 0.6636, "num_tokens": 161871295.0, "step": 1065 }, { "epoch": 0.3157273532015344, "grad_norm": 0.3619296351414091, "learning_rate": 4.971584699453552e-05, "loss": 0.656, "num_tokens": 162609215.0, "step": 1070 }, { "epoch": 0.3172027146650929, "grad_norm": 0.43240474033639986, "learning_rate": 4.968852459016394e-05, "loss": 0.6642, "num_tokens": 163397954.0, "step": 1075 }, { "epoch": 0.3186780761286515, "grad_norm": 0.3554388715771833, "learning_rate": 4.966120218579236e-05, "loss": 0.6728, "num_tokens": 164122360.0, "step": 1080 }, { "epoch": 0.3201534375922101, "grad_norm": 0.32427564320204355, "learning_rate": 4.963387978142077e-05, "loss": 0.6609, "num_tokens": 164852955.0, "step": 1085 }, { "epoch": 0.3216287990557687, "grad_norm": 0.4017113013304154, "learning_rate": 4.9606557377049184e-05, "loss": 0.6671, "num_tokens": 165635267.0, "step": 1090 }, { "epoch": 0.3231041605193272, "grad_norm": 0.35934099825654253, "learning_rate": 4.9579234972677594e-05, "loss": 0.655, "num_tokens": 166331837.0, "step": 1095 }, { "epoch": 0.3245795219828858, "grad_norm": 0.34156894336066196, "learning_rate": 4.955191256830601e-05, "loss": 0.6474, "num_tokens": 167047828.0, "step": 1100 }, { "epoch": 0.3260548834464444, "grad_norm": 0.3299296244490823, "learning_rate": 4.9524590163934434e-05, "loss": 0.644, "num_tokens": 167739192.0, "step": 1105 }, { "epoch": 0.32753024491000293, "grad_norm": 0.3375444929188915, "learning_rate": 4.9497267759562844e-05, "loss": 0.6509, "num_tokens": 168550425.0, "step": 1110 }, { "epoch": 0.3290056063735615, "grad_norm": 0.34285207896723846, "learning_rate": 4.946994535519126e-05, "loss": 0.6617, "num_tokens": 169280950.0, "step": 1115 }, { "epoch": 0.3304809678371201, "grad_norm": 0.2875274793279173, "learning_rate": 4.944262295081967e-05, "loss": 0.6516, "num_tokens": 170064631.0, "step": 1120 }, { "epoch": 0.33195632930067864, "grad_norm": 0.3165445462845268, "learning_rate": 4.941530054644809e-05, "loss": 0.6542, "num_tokens": 170765780.0, "step": 1125 }, { "epoch": 0.33343169076423723, "grad_norm": 0.34526862471875347, "learning_rate": 4.9387978142076504e-05, "loss": 0.639, "num_tokens": 171460689.0, "step": 1130 }, { "epoch": 0.3349070522277958, "grad_norm": 0.36425239805447635, "learning_rate": 4.936065573770492e-05, "loss": 0.6605, "num_tokens": 172170287.0, "step": 1135 }, { "epoch": 0.3363824136913544, "grad_norm": 0.30517019017211816, "learning_rate": 4.933333333333334e-05, "loss": 0.6694, "num_tokens": 172965923.0, "step": 1140 }, { "epoch": 0.33785777515491294, "grad_norm": 0.30771830423050567, "learning_rate": 4.930601092896175e-05, "loss": 0.65, "num_tokens": 173716595.0, "step": 1145 }, { "epoch": 0.33933313661847153, "grad_norm": 0.31541939771094196, "learning_rate": 4.9278688524590164e-05, "loss": 0.6566, "num_tokens": 174460876.0, "step": 1150 }, { "epoch": 0.3408084980820301, "grad_norm": 0.3288822318302357, "learning_rate": 4.925136612021858e-05, "loss": 0.6805, "num_tokens": 175260841.0, "step": 1155 }, { "epoch": 0.34228385954558865, "grad_norm": 0.3311966617261668, "learning_rate": 4.9224043715847e-05, "loss": 0.6542, "num_tokens": 176035906.0, "step": 1160 }, { "epoch": 0.34375922100914724, "grad_norm": 0.3222340965897435, "learning_rate": 4.9196721311475414e-05, "loss": 0.6479, "num_tokens": 176794951.0, "step": 1165 }, { "epoch": 0.34523458247270583, "grad_norm": 0.3210786583856869, "learning_rate": 4.9169398907103824e-05, "loss": 0.6462, "num_tokens": 177593185.0, "step": 1170 }, { "epoch": 0.34670994393626436, "grad_norm": 0.3124853341481146, "learning_rate": 4.914207650273224e-05, "loss": 0.658, "num_tokens": 178398865.0, "step": 1175 }, { "epoch": 0.34818530539982295, "grad_norm": 0.3351816984670995, "learning_rate": 4.911475409836066e-05, "loss": 0.6607, "num_tokens": 179150922.0, "step": 1180 }, { "epoch": 0.34966066686338154, "grad_norm": 0.3415679509681532, "learning_rate": 4.9087431693989074e-05, "loss": 0.6611, "num_tokens": 179888253.0, "step": 1185 }, { "epoch": 0.3511360283269401, "grad_norm": 0.3151598651625712, "learning_rate": 4.906010928961749e-05, "loss": 0.6587, "num_tokens": 180678891.0, "step": 1190 }, { "epoch": 0.35261138979049866, "grad_norm": 0.3179899764688933, "learning_rate": 4.90327868852459e-05, "loss": 0.6497, "num_tokens": 181436860.0, "step": 1195 }, { "epoch": 0.35408675125405725, "grad_norm": 0.35455416416575886, "learning_rate": 4.900546448087432e-05, "loss": 0.6754, "num_tokens": 182242736.0, "step": 1200 }, { "epoch": 0.35556211271761584, "grad_norm": 0.31019001141472546, "learning_rate": 4.8978142076502734e-05, "loss": 0.6517, "num_tokens": 183011509.0, "step": 1205 }, { "epoch": 0.3570374741811744, "grad_norm": 0.33915584097955104, "learning_rate": 4.895081967213115e-05, "loss": 0.6509, "num_tokens": 183753127.0, "step": 1210 }, { "epoch": 0.35851283564473296, "grad_norm": 0.3003689676230856, "learning_rate": 4.892349726775957e-05, "loss": 0.6442, "num_tokens": 184496416.0, "step": 1215 }, { "epoch": 0.35998819710829155, "grad_norm": 0.3583841867995617, "learning_rate": 4.889617486338798e-05, "loss": 0.6617, "num_tokens": 185245956.0, "step": 1220 }, { "epoch": 0.3614635585718501, "grad_norm": 0.3129353325287402, "learning_rate": 4.8868852459016395e-05, "loss": 0.6377, "num_tokens": 186006421.0, "step": 1225 }, { "epoch": 0.3629389200354087, "grad_norm": 0.2934901782114118, "learning_rate": 4.884153005464481e-05, "loss": 0.6457, "num_tokens": 186782250.0, "step": 1230 }, { "epoch": 0.36441428149896726, "grad_norm": 0.32682886810990913, "learning_rate": 4.881420765027323e-05, "loss": 0.6498, "num_tokens": 187476402.0, "step": 1235 }, { "epoch": 0.3658896429625258, "grad_norm": 0.32908608559150454, "learning_rate": 4.8786885245901645e-05, "loss": 0.6415, "num_tokens": 188258611.0, "step": 1240 }, { "epoch": 0.3673650044260844, "grad_norm": 0.3433597411573385, "learning_rate": 4.8759562841530055e-05, "loss": 0.6396, "num_tokens": 189012164.0, "step": 1245 }, { "epoch": 0.368840365889643, "grad_norm": 0.35046138902463736, "learning_rate": 4.873224043715847e-05, "loss": 0.6345, "num_tokens": 189743830.0, "step": 1250 }, { "epoch": 0.3703157273532015, "grad_norm": 0.3379708235394342, "learning_rate": 4.870491803278689e-05, "loss": 0.6614, "num_tokens": 190535999.0, "step": 1255 }, { "epoch": 0.3717910888167601, "grad_norm": 0.3077249780167005, "learning_rate": 4.86775956284153e-05, "loss": 0.6648, "num_tokens": 191378459.0, "step": 1260 }, { "epoch": 0.3732664502803187, "grad_norm": 0.311303591971354, "learning_rate": 4.865027322404372e-05, "loss": 0.6421, "num_tokens": 192135792.0, "step": 1265 }, { "epoch": 0.3747418117438773, "grad_norm": 0.31095059195088925, "learning_rate": 4.862295081967214e-05, "loss": 0.6459, "num_tokens": 192956308.0, "step": 1270 }, { "epoch": 0.3762171732074358, "grad_norm": 0.3006426431732409, "learning_rate": 4.859562841530055e-05, "loss": 0.6693, "num_tokens": 193709665.0, "step": 1275 }, { "epoch": 0.3776925346709944, "grad_norm": 0.30751455287540796, "learning_rate": 4.8568306010928965e-05, "loss": 0.653, "num_tokens": 194485418.0, "step": 1280 }, { "epoch": 0.379167896134553, "grad_norm": 0.29888480089251374, "learning_rate": 4.8540983606557375e-05, "loss": 0.6494, "num_tokens": 195226826.0, "step": 1285 }, { "epoch": 0.3806432575981115, "grad_norm": 0.3452436042537902, "learning_rate": 4.85136612021858e-05, "loss": 0.6476, "num_tokens": 195890082.0, "step": 1290 }, { "epoch": 0.3821186190616701, "grad_norm": 0.2949970934889131, "learning_rate": 4.8486338797814215e-05, "loss": 0.6641, "num_tokens": 196681674.0, "step": 1295 }, { "epoch": 0.3835939805252287, "grad_norm": 0.3071586941122736, "learning_rate": 4.8459016393442625e-05, "loss": 0.6503, "num_tokens": 197442100.0, "step": 1300 }, { "epoch": 0.38506934198878723, "grad_norm": 0.30979986181907143, "learning_rate": 4.843169398907104e-05, "loss": 0.6306, "num_tokens": 198228293.0, "step": 1305 }, { "epoch": 0.3865447034523458, "grad_norm": 0.3089220277062248, "learning_rate": 4.840437158469945e-05, "loss": 0.6508, "num_tokens": 199000623.0, "step": 1310 }, { "epoch": 0.3880200649159044, "grad_norm": 0.33877672096050704, "learning_rate": 4.837704918032787e-05, "loss": 0.6462, "num_tokens": 199749726.0, "step": 1315 }, { "epoch": 0.389495426379463, "grad_norm": 0.30033293624590784, "learning_rate": 4.834972677595629e-05, "loss": 0.6374, "num_tokens": 200502517.0, "step": 1320 }, { "epoch": 0.39097078784302153, "grad_norm": 0.33846293525433274, "learning_rate": 4.83224043715847e-05, "loss": 0.642, "num_tokens": 201240862.0, "step": 1325 }, { "epoch": 0.3924461493065801, "grad_norm": 0.3270396072358627, "learning_rate": 4.829508196721312e-05, "loss": 0.6353, "num_tokens": 201982591.0, "step": 1330 }, { "epoch": 0.3939215107701387, "grad_norm": 0.3025516041281335, "learning_rate": 4.826775956284153e-05, "loss": 0.6361, "num_tokens": 202711436.0, "step": 1335 }, { "epoch": 0.39539687223369724, "grad_norm": 0.3103475465492609, "learning_rate": 4.8240437158469945e-05, "loss": 0.6623, "num_tokens": 203498010.0, "step": 1340 }, { "epoch": 0.3968722336972558, "grad_norm": 0.3027362819292761, "learning_rate": 4.821311475409837e-05, "loss": 0.6678, "num_tokens": 204308293.0, "step": 1345 }, { "epoch": 0.3983475951608144, "grad_norm": 0.37900889992636444, "learning_rate": 4.818579234972678e-05, "loss": 0.6448, "num_tokens": 205000883.0, "step": 1350 }, { "epoch": 0.39982295662437295, "grad_norm": 0.3014290283648565, "learning_rate": 4.8158469945355195e-05, "loss": 0.6409, "num_tokens": 205781978.0, "step": 1355 }, { "epoch": 0.40129831808793154, "grad_norm": 0.35281263573306537, "learning_rate": 4.8131147540983605e-05, "loss": 0.6597, "num_tokens": 206620924.0, "step": 1360 }, { "epoch": 0.4027736795514901, "grad_norm": 0.33180213391807445, "learning_rate": 4.810382513661202e-05, "loss": 0.6327, "num_tokens": 207364241.0, "step": 1365 }, { "epoch": 0.40424904101504866, "grad_norm": 0.301179937123833, "learning_rate": 4.807650273224044e-05, "loss": 0.6289, "num_tokens": 208118178.0, "step": 1370 }, { "epoch": 0.40572440247860725, "grad_norm": 0.2944463369836755, "learning_rate": 4.8049180327868855e-05, "loss": 0.6366, "num_tokens": 208894045.0, "step": 1375 }, { "epoch": 0.40719976394216584, "grad_norm": 0.3189949527311881, "learning_rate": 4.802185792349727e-05, "loss": 0.6361, "num_tokens": 209608638.0, "step": 1380 }, { "epoch": 0.4086751254057244, "grad_norm": 0.30421207347753815, "learning_rate": 4.799453551912568e-05, "loss": 0.6462, "num_tokens": 210426106.0, "step": 1385 }, { "epoch": 0.41015048686928296, "grad_norm": 0.32070987748482344, "learning_rate": 4.79672131147541e-05, "loss": 0.6193, "num_tokens": 211168363.0, "step": 1390 }, { "epoch": 0.41162584833284155, "grad_norm": 0.33396927197312415, "learning_rate": 4.7939890710382515e-05, "loss": 0.6564, "num_tokens": 211937393.0, "step": 1395 }, { "epoch": 0.41310120979640014, "grad_norm": 0.3214709972548983, "learning_rate": 4.791256830601093e-05, "loss": 0.6466, "num_tokens": 212655547.0, "step": 1400 }, { "epoch": 0.41457657125995867, "grad_norm": 0.3101363467346591, "learning_rate": 4.788524590163935e-05, "loss": 0.6523, "num_tokens": 213421804.0, "step": 1405 }, { "epoch": 0.41605193272351726, "grad_norm": 0.37128420893709335, "learning_rate": 4.785792349726776e-05, "loss": 0.6383, "num_tokens": 214161496.0, "step": 1410 }, { "epoch": 0.41752729418707585, "grad_norm": 0.27753727879764906, "learning_rate": 4.7830601092896175e-05, "loss": 0.6501, "num_tokens": 214939041.0, "step": 1415 }, { "epoch": 0.4190026556506344, "grad_norm": 0.31045343698938, "learning_rate": 4.780327868852459e-05, "loss": 0.6407, "num_tokens": 215683085.0, "step": 1420 }, { "epoch": 0.42047801711419297, "grad_norm": 0.3311225883937589, "learning_rate": 4.777595628415301e-05, "loss": 0.6457, "num_tokens": 216435057.0, "step": 1425 }, { "epoch": 0.42195337857775156, "grad_norm": 0.330582685726911, "learning_rate": 4.7748633879781425e-05, "loss": 0.623, "num_tokens": 217167958.0, "step": 1430 }, { "epoch": 0.42342874004131015, "grad_norm": 0.3274694330466004, "learning_rate": 4.7721311475409835e-05, "loss": 0.6267, "num_tokens": 217969684.0, "step": 1435 }, { "epoch": 0.4249041015048687, "grad_norm": 0.3392416325877164, "learning_rate": 4.769398907103825e-05, "loss": 0.6483, "num_tokens": 218724012.0, "step": 1440 }, { "epoch": 0.42637946296842727, "grad_norm": 0.3321651657187014, "learning_rate": 4.766666666666667e-05, "loss": 0.6517, "num_tokens": 219432824.0, "step": 1445 }, { "epoch": 0.42785482443198586, "grad_norm": 0.31696987176045804, "learning_rate": 4.7639344262295086e-05, "loss": 0.6166, "num_tokens": 220211915.0, "step": 1450 }, { "epoch": 0.4293301858955444, "grad_norm": 0.29742001780670496, "learning_rate": 4.76120218579235e-05, "loss": 0.6362, "num_tokens": 220978034.0, "step": 1455 }, { "epoch": 0.430805547359103, "grad_norm": 0.3142082929445483, "learning_rate": 4.758469945355192e-05, "loss": 0.6181, "num_tokens": 221706160.0, "step": 1460 }, { "epoch": 0.43228090882266157, "grad_norm": 0.3230387307997254, "learning_rate": 4.755737704918033e-05, "loss": 0.6396, "num_tokens": 222446723.0, "step": 1465 }, { "epoch": 0.4337562702862201, "grad_norm": 0.3181183206265134, "learning_rate": 4.7530054644808746e-05, "loss": 0.6388, "num_tokens": 223218336.0, "step": 1470 }, { "epoch": 0.4352316317497787, "grad_norm": 0.32919495853232633, "learning_rate": 4.7502732240437156e-05, "loss": 0.6252, "num_tokens": 223948803.0, "step": 1475 }, { "epoch": 0.4367069932133373, "grad_norm": 0.3021486744319727, "learning_rate": 4.747540983606558e-05, "loss": 0.6511, "num_tokens": 224755482.0, "step": 1480 }, { "epoch": 0.4381823546768958, "grad_norm": 0.3587894129471992, "learning_rate": 4.7448087431693996e-05, "loss": 0.6305, "num_tokens": 225553367.0, "step": 1485 }, { "epoch": 0.4396577161404544, "grad_norm": 0.354707269955595, "learning_rate": 4.7420765027322406e-05, "loss": 0.6316, "num_tokens": 226336512.0, "step": 1490 }, { "epoch": 0.441133077604013, "grad_norm": 0.2906583048691042, "learning_rate": 4.739344262295082e-05, "loss": 0.6316, "num_tokens": 227104810.0, "step": 1495 }, { "epoch": 0.4426084390675716, "grad_norm": 0.3461029989127889, "learning_rate": 4.736612021857923e-05, "loss": 0.6181, "num_tokens": 227835229.0, "step": 1500 }, { "epoch": 0.4440838005311301, "grad_norm": 0.32731373065748276, "learning_rate": 4.7338797814207656e-05, "loss": 0.685, "num_tokens": 228545467.0, "step": 1505 }, { "epoch": 0.4455591619946887, "grad_norm": 0.305773858931596, "learning_rate": 4.731147540983607e-05, "loss": 0.6451, "num_tokens": 229293209.0, "step": 1510 }, { "epoch": 0.4470345234582473, "grad_norm": 0.3052230540438597, "learning_rate": 4.728415300546448e-05, "loss": 0.6331, "num_tokens": 230069371.0, "step": 1515 }, { "epoch": 0.4485098849218058, "grad_norm": 0.31694776793268364, "learning_rate": 4.72568306010929e-05, "loss": 0.6092, "num_tokens": 230778745.0, "step": 1520 }, { "epoch": 0.4499852463853644, "grad_norm": 0.3141529340667305, "learning_rate": 4.722950819672131e-05, "loss": 0.6209, "num_tokens": 231516989.0, "step": 1525 }, { "epoch": 0.451460607848923, "grad_norm": 0.29109456863790206, "learning_rate": 4.7202185792349726e-05, "loss": 0.616, "num_tokens": 232236281.0, "step": 1530 }, { "epoch": 0.45293596931248153, "grad_norm": 0.30698677579816214, "learning_rate": 4.717486338797815e-05, "loss": 0.6507, "num_tokens": 233047927.0, "step": 1535 }, { "epoch": 0.4544113307760401, "grad_norm": 0.2849787918182056, "learning_rate": 4.714754098360656e-05, "loss": 0.6248, "num_tokens": 233867034.0, "step": 1540 }, { "epoch": 0.4558866922395987, "grad_norm": 0.30678119580639907, "learning_rate": 4.7120218579234976e-05, "loss": 0.6352, "num_tokens": 234618193.0, "step": 1545 }, { "epoch": 0.4573620537031573, "grad_norm": 0.3037248775103274, "learning_rate": 4.7092896174863386e-05, "loss": 0.6338, "num_tokens": 235364061.0, "step": 1550 }, { "epoch": 0.45883741516671583, "grad_norm": 0.3180177604667606, "learning_rate": 4.70655737704918e-05, "loss": 0.6287, "num_tokens": 236067555.0, "step": 1555 }, { "epoch": 0.4603127766302744, "grad_norm": 0.345481460282226, "learning_rate": 4.7038251366120226e-05, "loss": 0.6484, "num_tokens": 236835970.0, "step": 1560 }, { "epoch": 0.461788138093833, "grad_norm": 0.27715822494870535, "learning_rate": 4.7010928961748636e-05, "loss": 0.634, "num_tokens": 237614706.0, "step": 1565 }, { "epoch": 0.46326349955739154, "grad_norm": 0.3101855882969245, "learning_rate": 4.698360655737705e-05, "loss": 0.6288, "num_tokens": 238391459.0, "step": 1570 }, { "epoch": 0.46473886102095013, "grad_norm": 0.3083089625991187, "learning_rate": 4.695628415300546e-05, "loss": 0.6131, "num_tokens": 239114628.0, "step": 1575 }, { "epoch": 0.4662142224845087, "grad_norm": 0.33374900728698237, "learning_rate": 4.692896174863388e-05, "loss": 0.6391, "num_tokens": 239891162.0, "step": 1580 }, { "epoch": 0.46768958394806726, "grad_norm": 0.3223807157611013, "learning_rate": 4.6901639344262296e-05, "loss": 0.6329, "num_tokens": 240595575.0, "step": 1585 }, { "epoch": 0.46916494541162584, "grad_norm": 0.2913651476600706, "learning_rate": 4.687431693989071e-05, "loss": 0.6146, "num_tokens": 241323306.0, "step": 1590 }, { "epoch": 0.47064030687518443, "grad_norm": 0.30001018883705205, "learning_rate": 4.684699453551913e-05, "loss": 0.6248, "num_tokens": 242117767.0, "step": 1595 }, { "epoch": 0.47211566833874297, "grad_norm": 0.30159161245764887, "learning_rate": 4.681967213114754e-05, "loss": 0.6462, "num_tokens": 242924911.0, "step": 1600 }, { "epoch": 0.47359102980230156, "grad_norm": 0.319473802415587, "learning_rate": 4.6792349726775956e-05, "loss": 0.6431, "num_tokens": 243638400.0, "step": 1605 }, { "epoch": 0.47506639126586014, "grad_norm": 0.3503821975610931, "learning_rate": 4.676502732240437e-05, "loss": 0.6144, "num_tokens": 244355702.0, "step": 1610 }, { "epoch": 0.47654175272941873, "grad_norm": 0.2949418076524757, "learning_rate": 4.673770491803279e-05, "loss": 0.6425, "num_tokens": 245189624.0, "step": 1615 }, { "epoch": 0.47801711419297727, "grad_norm": 0.3318562259642869, "learning_rate": 4.6710382513661206e-05, "loss": 0.635, "num_tokens": 245919606.0, "step": 1620 }, { "epoch": 0.47949247565653585, "grad_norm": 0.3088685820034644, "learning_rate": 4.668306010928962e-05, "loss": 0.6283, "num_tokens": 246638095.0, "step": 1625 }, { "epoch": 0.48096783712009444, "grad_norm": 0.3541432554061423, "learning_rate": 4.665573770491803e-05, "loss": 0.6336, "num_tokens": 247359804.0, "step": 1630 }, { "epoch": 0.482443198583653, "grad_norm": 0.28321923733809673, "learning_rate": 4.662841530054645e-05, "loss": 0.6333, "num_tokens": 248168323.0, "step": 1635 }, { "epoch": 0.48391856004721157, "grad_norm": 0.30767002234744906, "learning_rate": 4.6601092896174866e-05, "loss": 0.6237, "num_tokens": 248946425.0, "step": 1640 }, { "epoch": 0.48539392151077015, "grad_norm": 0.2981172203483414, "learning_rate": 4.657377049180328e-05, "loss": 0.6394, "num_tokens": 249723328.0, "step": 1645 }, { "epoch": 0.4868692829743287, "grad_norm": 0.3090235886815844, "learning_rate": 4.65464480874317e-05, "loss": 0.6097, "num_tokens": 250447431.0, "step": 1650 }, { "epoch": 0.4883446444378873, "grad_norm": 0.39567797577168917, "learning_rate": 4.651912568306011e-05, "loss": 0.6388, "num_tokens": 251221392.0, "step": 1655 }, { "epoch": 0.48982000590144587, "grad_norm": 0.33081459100728455, "learning_rate": 4.6491803278688526e-05, "loss": 0.6155, "num_tokens": 251931875.0, "step": 1660 }, { "epoch": 0.4912953673650044, "grad_norm": 0.3347936264690626, "learning_rate": 4.646448087431694e-05, "loss": 0.6167, "num_tokens": 252726886.0, "step": 1665 }, { "epoch": 0.492770728828563, "grad_norm": 0.30203547605279085, "learning_rate": 4.643715846994536e-05, "loss": 0.6255, "num_tokens": 253498875.0, "step": 1670 }, { "epoch": 0.4942460902921216, "grad_norm": 0.36292308158087017, "learning_rate": 4.6409836065573776e-05, "loss": 0.6239, "num_tokens": 254200828.0, "step": 1675 }, { "epoch": 0.49572145175568016, "grad_norm": 0.3121557937428645, "learning_rate": 4.6382513661202186e-05, "loss": 0.6321, "num_tokens": 255026673.0, "step": 1680 }, { "epoch": 0.4971968132192387, "grad_norm": 0.3022225771516257, "learning_rate": 4.63551912568306e-05, "loss": 0.6254, "num_tokens": 255776096.0, "step": 1685 }, { "epoch": 0.4986721746827973, "grad_norm": 0.3076411082955518, "learning_rate": 4.632786885245902e-05, "loss": 0.6164, "num_tokens": 256537711.0, "step": 1690 }, { "epoch": 0.5001475361463559, "grad_norm": 0.31102208296282224, "learning_rate": 4.6300546448087437e-05, "loss": 0.6364, "num_tokens": 257305592.0, "step": 1695 }, { "epoch": 0.5016228976099144, "grad_norm": 0.31023975974802814, "learning_rate": 4.627322404371585e-05, "loss": 0.6178, "num_tokens": 258066828.0, "step": 1700 }, { "epoch": 0.503098259073473, "grad_norm": 0.5711572509623039, "learning_rate": 4.624590163934426e-05, "loss": 0.6316, "num_tokens": 258845289.0, "step": 1705 }, { "epoch": 0.5045736205370316, "grad_norm": 0.2801791295554478, "learning_rate": 4.621857923497268e-05, "loss": 0.6278, "num_tokens": 259566771.0, "step": 1710 }, { "epoch": 0.5060489820005901, "grad_norm": 0.3099099024371325, "learning_rate": 4.619125683060109e-05, "loss": 0.6002, "num_tokens": 260270032.0, "step": 1715 }, { "epoch": 0.5075243434641488, "grad_norm": 0.2987656726046885, "learning_rate": 4.616393442622951e-05, "loss": 0.6232, "num_tokens": 261055648.0, "step": 1720 }, { "epoch": 0.5089997049277073, "grad_norm": 0.2756258822201766, "learning_rate": 4.613661202185793e-05, "loss": 0.6056, "num_tokens": 261825949.0, "step": 1725 }, { "epoch": 0.5104750663912658, "grad_norm": 0.3519329049971471, "learning_rate": 4.610928961748634e-05, "loss": 0.6156, "num_tokens": 262543221.0, "step": 1730 }, { "epoch": 0.5119504278548245, "grad_norm": 0.3084698834252734, "learning_rate": 4.608196721311476e-05, "loss": 0.6603, "num_tokens": 263319654.0, "step": 1735 }, { "epoch": 0.513425789318383, "grad_norm": 0.29529787605471486, "learning_rate": 4.605464480874317e-05, "loss": 0.6332, "num_tokens": 264107471.0, "step": 1740 }, { "epoch": 0.5149011507819415, "grad_norm": 0.32780193529560303, "learning_rate": 4.602732240437158e-05, "loss": 0.637, "num_tokens": 264869536.0, "step": 1745 }, { "epoch": 0.5163765122455002, "grad_norm": 0.3231225472975876, "learning_rate": 4.600000000000001e-05, "loss": 0.605, "num_tokens": 265596302.0, "step": 1750 }, { "epoch": 0.5178518737090587, "grad_norm": 0.3127490401041541, "learning_rate": 4.597267759562842e-05, "loss": 0.6192, "num_tokens": 266389025.0, "step": 1755 }, { "epoch": 0.5193272351726173, "grad_norm": 0.30346044067365996, "learning_rate": 4.5945355191256833e-05, "loss": 0.6081, "num_tokens": 267082157.0, "step": 1760 }, { "epoch": 0.5208025966361759, "grad_norm": 0.30036105555618087, "learning_rate": 4.5918032786885243e-05, "loss": 0.6324, "num_tokens": 267876677.0, "step": 1765 }, { "epoch": 0.5222779580997344, "grad_norm": 0.29978029102077564, "learning_rate": 4.589071038251366e-05, "loss": 0.622, "num_tokens": 268637502.0, "step": 1770 }, { "epoch": 0.523753319563293, "grad_norm": 0.29381829837631, "learning_rate": 4.5863387978142084e-05, "loss": 0.6163, "num_tokens": 269441474.0, "step": 1775 }, { "epoch": 0.5252286810268516, "grad_norm": 0.2965525994617663, "learning_rate": 4.5836065573770494e-05, "loss": 0.6204, "num_tokens": 270200835.0, "step": 1780 }, { "epoch": 0.5267040424904101, "grad_norm": 0.3143913482839287, "learning_rate": 4.580874316939891e-05, "loss": 0.5938, "num_tokens": 270907474.0, "step": 1785 }, { "epoch": 0.5281794039539687, "grad_norm": 0.3004951828607307, "learning_rate": 4.578142076502732e-05, "loss": 0.6187, "num_tokens": 271716836.0, "step": 1790 }, { "epoch": 0.5296547654175273, "grad_norm": 0.32222351224584844, "learning_rate": 4.575409836065574e-05, "loss": 0.6122, "num_tokens": 272419058.0, "step": 1795 }, { "epoch": 0.5311301268810859, "grad_norm": 0.29810121918293264, "learning_rate": 4.5726775956284154e-05, "loss": 0.6128, "num_tokens": 273144307.0, "step": 1800 }, { "epoch": 0.5326054883446444, "grad_norm": 0.3043985368377827, "learning_rate": 4.569945355191257e-05, "loss": 0.6236, "num_tokens": 273901404.0, "step": 1805 }, { "epoch": 0.534080849808203, "grad_norm": 0.2929061444207432, "learning_rate": 4.567213114754099e-05, "loss": 0.596, "num_tokens": 274650497.0, "step": 1810 }, { "epoch": 0.5355562112717616, "grad_norm": 0.30388569194113624, "learning_rate": 4.5644808743169404e-05, "loss": 0.6061, "num_tokens": 275438288.0, "step": 1815 }, { "epoch": 0.5370315727353202, "grad_norm": 0.29663778162290083, "learning_rate": 4.5617486338797814e-05, "loss": 0.624, "num_tokens": 276181128.0, "step": 1820 }, { "epoch": 0.5385069341988787, "grad_norm": 0.3157984983023545, "learning_rate": 4.559016393442623e-05, "loss": 0.6136, "num_tokens": 276902371.0, "step": 1825 }, { "epoch": 0.5399822956624373, "grad_norm": 0.2814549417795669, "learning_rate": 4.556284153005465e-05, "loss": 0.5951, "num_tokens": 277649965.0, "step": 1830 }, { "epoch": 0.5414576571259959, "grad_norm": 0.32519077594836016, "learning_rate": 4.5535519125683064e-05, "loss": 0.625, "num_tokens": 278347214.0, "step": 1835 }, { "epoch": 0.5429330185895545, "grad_norm": 0.28657307166778506, "learning_rate": 4.550819672131148e-05, "loss": 0.6165, "num_tokens": 279132887.0, "step": 1840 }, { "epoch": 0.544408380053113, "grad_norm": 0.3083955023925146, "learning_rate": 4.548087431693989e-05, "loss": 0.584, "num_tokens": 279843610.0, "step": 1845 }, { "epoch": 0.5458837415166716, "grad_norm": 0.2906031112909524, "learning_rate": 4.545355191256831e-05, "loss": 0.6042, "num_tokens": 280608357.0, "step": 1850 }, { "epoch": 0.5473591029802302, "grad_norm": 0.2812616785787168, "learning_rate": 4.5426229508196724e-05, "loss": 0.6305, "num_tokens": 281420094.0, "step": 1855 }, { "epoch": 0.5488344644437887, "grad_norm": 0.32754957856702743, "learning_rate": 4.539890710382514e-05, "loss": 0.6166, "num_tokens": 282155234.0, "step": 1860 }, { "epoch": 0.5503098259073473, "grad_norm": 0.3133227583971586, "learning_rate": 4.537158469945356e-05, "loss": 0.621, "num_tokens": 282925712.0, "step": 1865 }, { "epoch": 0.5517851873709059, "grad_norm": 0.27753273133069517, "learning_rate": 4.534426229508197e-05, "loss": 0.6029, "num_tokens": 283704005.0, "step": 1870 }, { "epoch": 0.5532605488344644, "grad_norm": 0.29618835574067537, "learning_rate": 4.5316939890710384e-05, "loss": 0.6217, "num_tokens": 284505935.0, "step": 1875 }, { "epoch": 0.554735910298023, "grad_norm": 0.2865544511419171, "learning_rate": 4.52896174863388e-05, "loss": 0.6086, "num_tokens": 285288238.0, "step": 1880 }, { "epoch": 0.5562112717615816, "grad_norm": 0.3113285467883617, "learning_rate": 4.526229508196722e-05, "loss": 0.6141, "num_tokens": 286018216.0, "step": 1885 }, { "epoch": 0.5576866332251401, "grad_norm": 0.3235619187675825, "learning_rate": 4.5234972677595634e-05, "loss": 0.5971, "num_tokens": 286722250.0, "step": 1890 }, { "epoch": 0.5591619946886988, "grad_norm": 0.2986726624952025, "learning_rate": 4.5207650273224044e-05, "loss": 0.6263, "num_tokens": 287448059.0, "step": 1895 }, { "epoch": 0.5606373561522573, "grad_norm": 0.28433382298675564, "learning_rate": 4.518032786885246e-05, "loss": 0.6242, "num_tokens": 288258207.0, "step": 1900 }, { "epoch": 0.5621127176158158, "grad_norm": 0.28187362765940055, "learning_rate": 4.515300546448088e-05, "loss": 0.6187, "num_tokens": 289051085.0, "step": 1905 }, { "epoch": 0.5635880790793745, "grad_norm": 0.32691229970569147, "learning_rate": 4.5125683060109294e-05, "loss": 0.5968, "num_tokens": 289772529.0, "step": 1910 }, { "epoch": 0.565063440542933, "grad_norm": 0.28325870967157857, "learning_rate": 4.509836065573771e-05, "loss": 0.6001, "num_tokens": 290569038.0, "step": 1915 }, { "epoch": 0.5665388020064915, "grad_norm": 0.2962579137755696, "learning_rate": 4.507103825136612e-05, "loss": 0.6051, "num_tokens": 291319974.0, "step": 1920 }, { "epoch": 0.5680141634700502, "grad_norm": 0.9129449810612286, "learning_rate": 4.504371584699454e-05, "loss": 0.5955, "num_tokens": 292073566.0, "step": 1925 }, { "epoch": 0.5694895249336087, "grad_norm": 0.3001712486206535, "learning_rate": 4.501639344262295e-05, "loss": 0.5915, "num_tokens": 292786762.0, "step": 1930 }, { "epoch": 0.5709648863971674, "grad_norm": 0.2904032947884006, "learning_rate": 4.498907103825137e-05, "loss": 0.5863, "num_tokens": 293567441.0, "step": 1935 }, { "epoch": 0.5724402478607259, "grad_norm": 0.30327890482883557, "learning_rate": 4.496174863387979e-05, "loss": 0.5886, "num_tokens": 294242942.0, "step": 1940 }, { "epoch": 0.5739156093242844, "grad_norm": 0.2952367144783484, "learning_rate": 4.49344262295082e-05, "loss": 0.6367, "num_tokens": 295090132.0, "step": 1945 }, { "epoch": 0.5753909707878431, "grad_norm": 0.3094963217641471, "learning_rate": 4.4907103825136614e-05, "loss": 0.612, "num_tokens": 295847451.0, "step": 1950 }, { "epoch": 0.5768663322514016, "grad_norm": 0.28148925260100033, "learning_rate": 4.4879781420765024e-05, "loss": 0.604, "num_tokens": 296604736.0, "step": 1955 }, { "epoch": 0.5783416937149601, "grad_norm": 0.3734259404897707, "learning_rate": 4.485245901639345e-05, "loss": 0.5922, "num_tokens": 297331629.0, "step": 1960 }, { "epoch": 0.5798170551785188, "grad_norm": 0.31647780968196304, "learning_rate": 4.4825136612021864e-05, "loss": 0.5952, "num_tokens": 298078276.0, "step": 1965 }, { "epoch": 0.5812924166420773, "grad_norm": 0.28894456032832694, "learning_rate": 4.4797814207650274e-05, "loss": 0.6024, "num_tokens": 298871473.0, "step": 1970 }, { "epoch": 0.5827677781056358, "grad_norm": 0.30435248991595576, "learning_rate": 4.477049180327869e-05, "loss": 0.5983, "num_tokens": 299576317.0, "step": 1975 }, { "epoch": 0.5842431395691945, "grad_norm": 0.28708586675329567, "learning_rate": 4.47431693989071e-05, "loss": 0.5965, "num_tokens": 300287273.0, "step": 1980 }, { "epoch": 0.585718501032753, "grad_norm": 0.31546630693121064, "learning_rate": 4.471584699453552e-05, "loss": 0.6196, "num_tokens": 301106692.0, "step": 1985 }, { "epoch": 0.5871938624963116, "grad_norm": 0.29411232472081406, "learning_rate": 4.468852459016394e-05, "loss": 0.621, "num_tokens": 301878003.0, "step": 1990 }, { "epoch": 0.5886692239598702, "grad_norm": 0.2968095877467125, "learning_rate": 4.466120218579235e-05, "loss": 0.5928, "num_tokens": 302583341.0, "step": 1995 }, { "epoch": 0.5901445854234287, "grad_norm": 0.30122367321164023, "learning_rate": 4.463387978142077e-05, "loss": 0.6147, "num_tokens": 303379274.0, "step": 2000 }, { "epoch": 0.5916199468869873, "grad_norm": 0.29491509970584284, "learning_rate": 4.4606557377049184e-05, "loss": 0.6025, "num_tokens": 304162371.0, "step": 2005 }, { "epoch": 0.5930953083505459, "grad_norm": 0.2572647773057109, "learning_rate": 4.4579234972677594e-05, "loss": 0.6092, "num_tokens": 304934786.0, "step": 2010 }, { "epoch": 0.5945706698141044, "grad_norm": 0.2800405738284262, "learning_rate": 4.455191256830602e-05, "loss": 0.5964, "num_tokens": 305666208.0, "step": 2015 }, { "epoch": 0.596046031277663, "grad_norm": 0.3159443388265349, "learning_rate": 4.452459016393443e-05, "loss": 0.5874, "num_tokens": 306436031.0, "step": 2020 }, { "epoch": 0.5975213927412216, "grad_norm": 0.33047618466764195, "learning_rate": 4.4497267759562845e-05, "loss": 0.6244, "num_tokens": 307166132.0, "step": 2025 }, { "epoch": 0.5989967542047802, "grad_norm": 0.2881455734088398, "learning_rate": 4.446994535519126e-05, "loss": 0.6153, "num_tokens": 307983900.0, "step": 2030 }, { "epoch": 0.6004721156683387, "grad_norm": 0.390551302902322, "learning_rate": 4.444262295081967e-05, "loss": 0.5851, "num_tokens": 308669916.0, "step": 2035 }, { "epoch": 0.6019474771318973, "grad_norm": 0.33162729234263505, "learning_rate": 4.441530054644809e-05, "loss": 0.6092, "num_tokens": 309436666.0, "step": 2040 }, { "epoch": 0.6034228385954559, "grad_norm": 0.29808057445258723, "learning_rate": 4.4387978142076505e-05, "loss": 0.5877, "num_tokens": 310211921.0, "step": 2045 }, { "epoch": 0.6048982000590145, "grad_norm": 0.3027637651615133, "learning_rate": 4.436065573770492e-05, "loss": 0.6046, "num_tokens": 310911002.0, "step": 2050 }, { "epoch": 0.606373561522573, "grad_norm": 0.29848625268778217, "learning_rate": 4.433333333333334e-05, "loss": 0.5936, "num_tokens": 311652243.0, "step": 2055 }, { "epoch": 0.6078489229861316, "grad_norm": 0.2731300347384377, "learning_rate": 4.430601092896175e-05, "loss": 0.6094, "num_tokens": 312422011.0, "step": 2060 }, { "epoch": 0.6093242844496902, "grad_norm": 0.3416196091954726, "learning_rate": 4.4278688524590165e-05, "loss": 0.6128, "num_tokens": 313151516.0, "step": 2065 }, { "epoch": 0.6107996459132488, "grad_norm": 0.2687947780458006, "learning_rate": 4.425136612021858e-05, "loss": 0.5861, "num_tokens": 313871462.0, "step": 2070 }, { "epoch": 0.6122750073768073, "grad_norm": 0.27991298597867303, "learning_rate": 4.4224043715847e-05, "loss": 0.6101, "num_tokens": 314628297.0, "step": 2075 }, { "epoch": 0.6137503688403659, "grad_norm": 0.3384364218678626, "learning_rate": 4.4196721311475415e-05, "loss": 0.6091, "num_tokens": 315340424.0, "step": 2080 }, { "epoch": 0.6152257303039245, "grad_norm": 0.2651121589507928, "learning_rate": 4.4169398907103825e-05, "loss": 0.6049, "num_tokens": 316160525.0, "step": 2085 }, { "epoch": 0.616701091767483, "grad_norm": 0.2853608500658328, "learning_rate": 4.414207650273224e-05, "loss": 0.5865, "num_tokens": 316923087.0, "step": 2090 }, { "epoch": 0.6181764532310416, "grad_norm": 0.2906936621538619, "learning_rate": 4.411475409836066e-05, "loss": 0.5896, "num_tokens": 317657946.0, "step": 2095 }, { "epoch": 0.6196518146946002, "grad_norm": 0.314722428018079, "learning_rate": 4.4087431693989075e-05, "loss": 0.6064, "num_tokens": 318461600.0, "step": 2100 }, { "epoch": 0.6211271761581587, "grad_norm": 0.294042832413386, "learning_rate": 4.406010928961749e-05, "loss": 0.5845, "num_tokens": 319183147.0, "step": 2105 }, { "epoch": 0.6226025376217174, "grad_norm": 0.2983401191704106, "learning_rate": 4.40327868852459e-05, "loss": 0.6016, "num_tokens": 319959993.0, "step": 2110 }, { "epoch": 0.6240778990852759, "grad_norm": 0.28955650147335477, "learning_rate": 4.400546448087432e-05, "loss": 0.5762, "num_tokens": 320599496.0, "step": 2115 }, { "epoch": 0.6255532605488344, "grad_norm": 0.255888498684002, "learning_rate": 4.3978142076502735e-05, "loss": 0.6025, "num_tokens": 321395112.0, "step": 2120 }, { "epoch": 0.6270286220123931, "grad_norm": 0.2611900988870063, "learning_rate": 4.395081967213115e-05, "loss": 0.5852, "num_tokens": 322134680.0, "step": 2125 }, { "epoch": 0.6285039834759516, "grad_norm": 0.27512479999971856, "learning_rate": 4.392349726775957e-05, "loss": 0.5934, "num_tokens": 322918878.0, "step": 2130 }, { "epoch": 0.6299793449395101, "grad_norm": 0.30221379470458704, "learning_rate": 4.389617486338798e-05, "loss": 0.5978, "num_tokens": 323689349.0, "step": 2135 }, { "epoch": 0.6314547064030688, "grad_norm": 0.2848412594378135, "learning_rate": 4.3868852459016395e-05, "loss": 0.5968, "num_tokens": 324471263.0, "step": 2140 }, { "epoch": 0.6329300678666273, "grad_norm": 0.277430919165157, "learning_rate": 4.3841530054644805e-05, "loss": 0.6035, "num_tokens": 325258360.0, "step": 2145 }, { "epoch": 0.6344054293301858, "grad_norm": 0.2818599194838905, "learning_rate": 4.381420765027323e-05, "loss": 0.5917, "num_tokens": 326009269.0, "step": 2150 }, { "epoch": 0.6358807907937445, "grad_norm": 0.26993594630392087, "learning_rate": 4.3786885245901645e-05, "loss": 0.6064, "num_tokens": 326824657.0, "step": 2155 }, { "epoch": 0.637356152257303, "grad_norm": 0.27205185830042733, "learning_rate": 4.3759562841530055e-05, "loss": 0.5963, "num_tokens": 327667790.0, "step": 2160 }, { "epoch": 0.6388315137208617, "grad_norm": 0.28735100976850003, "learning_rate": 4.373224043715847e-05, "loss": 0.5891, "num_tokens": 328434447.0, "step": 2165 }, { "epoch": 0.6403068751844202, "grad_norm": 0.27453549445675307, "learning_rate": 4.370491803278688e-05, "loss": 0.5815, "num_tokens": 329182302.0, "step": 2170 }, { "epoch": 0.6417822366479787, "grad_norm": 0.2960112664359568, "learning_rate": 4.3677595628415305e-05, "loss": 0.5954, "num_tokens": 329884398.0, "step": 2175 }, { "epoch": 0.6432575981115374, "grad_norm": 0.26597181402195463, "learning_rate": 4.365027322404372e-05, "loss": 0.5842, "num_tokens": 330634441.0, "step": 2180 }, { "epoch": 0.6447329595750959, "grad_norm": 0.27827992366619586, "learning_rate": 4.362295081967213e-05, "loss": 0.5955, "num_tokens": 331410158.0, "step": 2185 }, { "epoch": 0.6462083210386544, "grad_norm": 0.2861265787380432, "learning_rate": 4.359562841530055e-05, "loss": 0.5797, "num_tokens": 332111942.0, "step": 2190 }, { "epoch": 0.6476836825022131, "grad_norm": 0.2941521306784061, "learning_rate": 4.3568306010928965e-05, "loss": 0.5747, "num_tokens": 332880959.0, "step": 2195 }, { "epoch": 0.6491590439657716, "grad_norm": 0.3127103633589907, "learning_rate": 4.3540983606557375e-05, "loss": 0.5977, "num_tokens": 333580436.0, "step": 2200 }, { "epoch": 0.6506344054293302, "grad_norm": 0.28584789087998813, "learning_rate": 4.35136612021858e-05, "loss": 0.5766, "num_tokens": 334327442.0, "step": 2205 }, { "epoch": 0.6521097668928888, "grad_norm": 0.28361162163114884, "learning_rate": 4.348633879781421e-05, "loss": 0.6, "num_tokens": 335099480.0, "step": 2210 }, { "epoch": 0.6535851283564473, "grad_norm": 0.2810039904813458, "learning_rate": 4.3459016393442625e-05, "loss": 0.6157, "num_tokens": 335881181.0, "step": 2215 }, { "epoch": 0.6550604898200059, "grad_norm": 0.27790740932808256, "learning_rate": 4.343169398907104e-05, "loss": 0.5977, "num_tokens": 336635215.0, "step": 2220 }, { "epoch": 0.6565358512835645, "grad_norm": 0.2789168410551777, "learning_rate": 4.340437158469945e-05, "loss": 0.5989, "num_tokens": 337377657.0, "step": 2225 }, { "epoch": 0.658011212747123, "grad_norm": 0.30919189874888087, "learning_rate": 4.3377049180327875e-05, "loss": 0.6072, "num_tokens": 338092336.0, "step": 2230 }, { "epoch": 0.6594865742106816, "grad_norm": 0.29666664758782096, "learning_rate": 4.3349726775956285e-05, "loss": 0.612, "num_tokens": 338852179.0, "step": 2235 }, { "epoch": 0.6609619356742402, "grad_norm": 0.285938024711431, "learning_rate": 4.33224043715847e-05, "loss": 0.5882, "num_tokens": 339561744.0, "step": 2240 }, { "epoch": 0.6624372971377988, "grad_norm": 0.3154800872268964, "learning_rate": 4.329508196721312e-05, "loss": 0.5864, "num_tokens": 340315569.0, "step": 2245 }, { "epoch": 0.6639126586013573, "grad_norm": 0.2806431688598427, "learning_rate": 4.326775956284153e-05, "loss": 0.5919, "num_tokens": 341130372.0, "step": 2250 }, { "epoch": 0.6653880200649159, "grad_norm": 0.2698439187943593, "learning_rate": 4.3240437158469945e-05, "loss": 0.5905, "num_tokens": 341889656.0, "step": 2255 }, { "epoch": 0.6668633815284745, "grad_norm": 0.2525405075180738, "learning_rate": 4.321311475409836e-05, "loss": 0.602, "num_tokens": 342662114.0, "step": 2260 }, { "epoch": 0.668338742992033, "grad_norm": 0.3031167746383424, "learning_rate": 4.318579234972678e-05, "loss": 0.5762, "num_tokens": 343426969.0, "step": 2265 }, { "epoch": 0.6698141044555916, "grad_norm": 0.27832935016301574, "learning_rate": 4.3158469945355196e-05, "loss": 0.6143, "num_tokens": 344208786.0, "step": 2270 }, { "epoch": 0.6712894659191502, "grad_norm": 0.3153301775088319, "learning_rate": 4.3131147540983606e-05, "loss": 0.5949, "num_tokens": 344918774.0, "step": 2275 }, { "epoch": 0.6727648273827088, "grad_norm": 0.2892226967842947, "learning_rate": 4.310382513661202e-05, "loss": 0.5991, "num_tokens": 345698730.0, "step": 2280 }, { "epoch": 0.6742401888462674, "grad_norm": 0.3004720162714327, "learning_rate": 4.307650273224044e-05, "loss": 0.5976, "num_tokens": 346433442.0, "step": 2285 }, { "epoch": 0.6757155503098259, "grad_norm": 0.27762012378001005, "learning_rate": 4.3049180327868856e-05, "loss": 0.5993, "num_tokens": 347218622.0, "step": 2290 }, { "epoch": 0.6771909117733845, "grad_norm": 0.30075371214796914, "learning_rate": 4.302185792349727e-05, "loss": 0.5782, "num_tokens": 347957535.0, "step": 2295 }, { "epoch": 0.6786662732369431, "grad_norm": 0.316030715974753, "learning_rate": 4.299453551912568e-05, "loss": 0.6069, "num_tokens": 348687205.0, "step": 2300 }, { "epoch": 0.6801416347005016, "grad_norm": 0.28655895210608884, "learning_rate": 4.29672131147541e-05, "loss": 0.5903, "num_tokens": 349451685.0, "step": 2305 }, { "epoch": 0.6816169961640602, "grad_norm": 0.33180640651858617, "learning_rate": 4.2939890710382516e-05, "loss": 0.5815, "num_tokens": 350222603.0, "step": 2310 }, { "epoch": 0.6830923576276188, "grad_norm": 0.29337869110726417, "learning_rate": 4.291256830601093e-05, "loss": 0.5916, "num_tokens": 350998727.0, "step": 2315 }, { "epoch": 0.6845677190911773, "grad_norm": 0.30160017693791896, "learning_rate": 4.288524590163935e-05, "loss": 0.5714, "num_tokens": 351711973.0, "step": 2320 }, { "epoch": 0.686043080554736, "grad_norm": 0.2911375002994324, "learning_rate": 4.285792349726776e-05, "loss": 0.5859, "num_tokens": 352442563.0, "step": 2325 }, { "epoch": 0.6875184420182945, "grad_norm": 0.3206662039930908, "learning_rate": 4.2830601092896176e-05, "loss": 0.5672, "num_tokens": 353137988.0, "step": 2330 }, { "epoch": 0.688993803481853, "grad_norm": 0.29838005977513515, "learning_rate": 4.280327868852459e-05, "loss": 0.5919, "num_tokens": 353859209.0, "step": 2335 }, { "epoch": 0.6904691649454117, "grad_norm": 0.2867755916523669, "learning_rate": 4.277595628415301e-05, "loss": 0.5895, "num_tokens": 354587099.0, "step": 2340 }, { "epoch": 0.6919445264089702, "grad_norm": 0.3007975904686551, "learning_rate": 4.2748633879781426e-05, "loss": 0.5906, "num_tokens": 355322508.0, "step": 2345 }, { "epoch": 0.6934198878725287, "grad_norm": 0.28858216282257615, "learning_rate": 4.2721311475409836e-05, "loss": 0.5882, "num_tokens": 356075848.0, "step": 2350 }, { "epoch": 0.6948952493360874, "grad_norm": 0.316394020147214, "learning_rate": 4.269398907103825e-05, "loss": 0.591, "num_tokens": 356835970.0, "step": 2355 }, { "epoch": 0.6963706107996459, "grad_norm": 0.28157014138277664, "learning_rate": 4.266666666666667e-05, "loss": 0.5991, "num_tokens": 357550660.0, "step": 2360 }, { "epoch": 0.6978459722632044, "grad_norm": 0.2691758903585563, "learning_rate": 4.2639344262295086e-05, "loss": 0.5936, "num_tokens": 358270859.0, "step": 2365 }, { "epoch": 0.6993213337267631, "grad_norm": 0.2967457048310515, "learning_rate": 4.26120218579235e-05, "loss": 0.5818, "num_tokens": 359046324.0, "step": 2370 }, { "epoch": 0.7007966951903216, "grad_norm": 0.3142896339634803, "learning_rate": 4.258469945355191e-05, "loss": 0.5685, "num_tokens": 359715917.0, "step": 2375 }, { "epoch": 0.7022720566538802, "grad_norm": 0.3002621537984152, "learning_rate": 4.255737704918033e-05, "loss": 0.5961, "num_tokens": 360472963.0, "step": 2380 }, { "epoch": 0.7037474181174388, "grad_norm": 0.28808918303765, "learning_rate": 4.2530054644808746e-05, "loss": 0.5731, "num_tokens": 361242072.0, "step": 2385 }, { "epoch": 0.7052227795809973, "grad_norm": 0.2836419977780502, "learning_rate": 4.250273224043716e-05, "loss": 0.6051, "num_tokens": 362036658.0, "step": 2390 }, { "epoch": 0.7066981410445559, "grad_norm": 0.2891957659725607, "learning_rate": 4.247540983606558e-05, "loss": 0.5819, "num_tokens": 362776362.0, "step": 2395 }, { "epoch": 0.7081735025081145, "grad_norm": 0.297027349339251, "learning_rate": 4.244808743169399e-05, "loss": 0.5975, "num_tokens": 363531111.0, "step": 2400 }, { "epoch": 0.709648863971673, "grad_norm": 0.32741940235528677, "learning_rate": 4.2420765027322406e-05, "loss": 0.5818, "num_tokens": 364306834.0, "step": 2405 }, { "epoch": 0.7111242254352317, "grad_norm": 0.2774238619386101, "learning_rate": 4.239344262295082e-05, "loss": 0.5811, "num_tokens": 365125545.0, "step": 2410 }, { "epoch": 0.7125995868987902, "grad_norm": 0.30872544260596463, "learning_rate": 4.236612021857923e-05, "loss": 0.5709, "num_tokens": 365869820.0, "step": 2415 }, { "epoch": 0.7140749483623487, "grad_norm": 0.29302651138723124, "learning_rate": 4.2338797814207656e-05, "loss": 0.6005, "num_tokens": 366613758.0, "step": 2420 }, { "epoch": 0.7155503098259074, "grad_norm": 0.3027099707363095, "learning_rate": 4.2311475409836066e-05, "loss": 0.5998, "num_tokens": 367402717.0, "step": 2425 }, { "epoch": 0.7170256712894659, "grad_norm": 0.3006688151559092, "learning_rate": 4.228415300546448e-05, "loss": 0.5858, "num_tokens": 368178359.0, "step": 2430 }, { "epoch": 0.7185010327530245, "grad_norm": 0.27350447378699844, "learning_rate": 4.22568306010929e-05, "loss": 0.5795, "num_tokens": 368926269.0, "step": 2435 }, { "epoch": 0.7199763942165831, "grad_norm": 0.2965090807526827, "learning_rate": 4.222950819672131e-05, "loss": 0.5794, "num_tokens": 369751833.0, "step": 2440 }, { "epoch": 0.7214517556801416, "grad_norm": 0.3057139665742985, "learning_rate": 4.220218579234973e-05, "loss": 0.579, "num_tokens": 370453120.0, "step": 2445 }, { "epoch": 0.7229271171437002, "grad_norm": 0.27001570737817543, "learning_rate": 4.217486338797814e-05, "loss": 0.596, "num_tokens": 371234102.0, "step": 2450 }, { "epoch": 0.7244024786072588, "grad_norm": 0.2796524664043356, "learning_rate": 4.214754098360656e-05, "loss": 0.5947, "num_tokens": 372028638.0, "step": 2455 }, { "epoch": 0.7258778400708173, "grad_norm": 0.27042602181730996, "learning_rate": 4.2120218579234976e-05, "loss": 0.5813, "num_tokens": 372850721.0, "step": 2460 }, { "epoch": 0.7273532015343759, "grad_norm": 0.278405880583853, "learning_rate": 4.2092896174863386e-05, "loss": 0.5902, "num_tokens": 373614230.0, "step": 2465 }, { "epoch": 0.7288285629979345, "grad_norm": 0.26940616651487287, "learning_rate": 4.20655737704918e-05, "loss": 0.5992, "num_tokens": 374361568.0, "step": 2470 }, { "epoch": 0.7303039244614931, "grad_norm": 0.2619296280067162, "learning_rate": 4.203825136612022e-05, "loss": 0.5856, "num_tokens": 375173754.0, "step": 2475 }, { "epoch": 0.7317792859250516, "grad_norm": 0.24639776929271726, "learning_rate": 4.2010928961748636e-05, "loss": 0.5666, "num_tokens": 375938308.0, "step": 2480 }, { "epoch": 0.7332546473886102, "grad_norm": 0.26893280953543935, "learning_rate": 4.198360655737705e-05, "loss": 0.5698, "num_tokens": 376697153.0, "step": 2485 }, { "epoch": 0.7347300088521688, "grad_norm": 0.267628848514466, "learning_rate": 4.195628415300546e-05, "loss": 0.583, "num_tokens": 377476543.0, "step": 2490 }, { "epoch": 0.7362053703157273, "grad_norm": 0.2768063642048407, "learning_rate": 4.192896174863388e-05, "loss": 0.5953, "num_tokens": 378221524.0, "step": 2495 }, { "epoch": 0.737680731779286, "grad_norm": 0.2821625624802579, "learning_rate": 4.1901639344262296e-05, "loss": 0.5796, "num_tokens": 378991765.0, "step": 2500 }, { "epoch": 0.7391560932428445, "grad_norm": 0.2565934326485956, "learning_rate": 4.187431693989071e-05, "loss": 0.5827, "num_tokens": 379749829.0, "step": 2505 }, { "epoch": 0.740631454706403, "grad_norm": 0.2790855393551445, "learning_rate": 4.184699453551913e-05, "loss": 0.6004, "num_tokens": 380535299.0, "step": 2510 }, { "epoch": 0.7421068161699617, "grad_norm": 0.2907231020965004, "learning_rate": 4.181967213114754e-05, "loss": 0.574, "num_tokens": 381304327.0, "step": 2515 }, { "epoch": 0.7435821776335202, "grad_norm": 0.2599627306552256, "learning_rate": 4.1792349726775957e-05, "loss": 0.582, "num_tokens": 382073395.0, "step": 2520 }, { "epoch": 0.7450575390970788, "grad_norm": 0.30391995443080577, "learning_rate": 4.176502732240437e-05, "loss": 0.5747, "num_tokens": 382833518.0, "step": 2525 }, { "epoch": 0.7465329005606374, "grad_norm": 0.2685242327623912, "learning_rate": 4.173770491803279e-05, "loss": 0.588, "num_tokens": 383682807.0, "step": 2530 }, { "epoch": 0.7480082620241959, "grad_norm": 0.2716269380347485, "learning_rate": 4.171038251366121e-05, "loss": 0.5696, "num_tokens": 384355248.0, "step": 2535 }, { "epoch": 0.7494836234877545, "grad_norm": 0.33213240820630513, "learning_rate": 4.1683060109289617e-05, "loss": 0.5513, "num_tokens": 385065277.0, "step": 2540 }, { "epoch": 0.7509589849513131, "grad_norm": 0.27308442460179105, "learning_rate": 4.165573770491803e-05, "loss": 0.5809, "num_tokens": 385828256.0, "step": 2545 }, { "epoch": 0.7524343464148716, "grad_norm": 0.30404639083211144, "learning_rate": 4.162841530054645e-05, "loss": 0.5685, "num_tokens": 386512184.0, "step": 2550 }, { "epoch": 0.7539097078784303, "grad_norm": 0.302606134980511, "learning_rate": 4.160109289617487e-05, "loss": 0.5889, "num_tokens": 387280497.0, "step": 2555 }, { "epoch": 0.7553850693419888, "grad_norm": 0.30490218687508724, "learning_rate": 4.1573770491803283e-05, "loss": 0.5921, "num_tokens": 388018490.0, "step": 2560 }, { "epoch": 0.7568604308055473, "grad_norm": 0.28039457656722006, "learning_rate": 4.154644808743169e-05, "loss": 0.5813, "num_tokens": 388769530.0, "step": 2565 }, { "epoch": 0.758335792269106, "grad_norm": 0.28448027987795926, "learning_rate": 4.151912568306011e-05, "loss": 0.5798, "num_tokens": 389542550.0, "step": 2570 }, { "epoch": 0.7598111537326645, "grad_norm": 0.3108417221306342, "learning_rate": 4.149180327868853e-05, "loss": 0.5795, "num_tokens": 390301251.0, "step": 2575 }, { "epoch": 0.761286515196223, "grad_norm": 0.2887148610365919, "learning_rate": 4.1464480874316944e-05, "loss": 0.5725, "num_tokens": 391058826.0, "step": 2580 }, { "epoch": 0.7627618766597817, "grad_norm": 0.2899356117297813, "learning_rate": 4.143715846994536e-05, "loss": 0.5648, "num_tokens": 391779551.0, "step": 2585 }, { "epoch": 0.7642372381233402, "grad_norm": 0.2526699583199347, "learning_rate": 4.140983606557377e-05, "loss": 0.5721, "num_tokens": 392559096.0, "step": 2590 }, { "epoch": 0.7657125995868987, "grad_norm": 0.267290739302913, "learning_rate": 4.138251366120219e-05, "loss": 0.5651, "num_tokens": 393329389.0, "step": 2595 }, { "epoch": 0.7671879610504574, "grad_norm": 0.2998430859070577, "learning_rate": 4.1355191256830604e-05, "loss": 0.5648, "num_tokens": 394001889.0, "step": 2600 }, { "epoch": 0.7686633225140159, "grad_norm": 0.2812969870468317, "learning_rate": 4.132786885245902e-05, "loss": 0.5966, "num_tokens": 394799348.0, "step": 2605 }, { "epoch": 0.7701386839775745, "grad_norm": 0.30928451109265886, "learning_rate": 4.130054644808744e-05, "loss": 0.5803, "num_tokens": 395550301.0, "step": 2610 }, { "epoch": 0.7716140454411331, "grad_norm": 0.275273132944236, "learning_rate": 4.127322404371585e-05, "loss": 0.5827, "num_tokens": 396325461.0, "step": 2615 }, { "epoch": 0.7730894069046916, "grad_norm": 0.3021349948986392, "learning_rate": 4.1245901639344264e-05, "loss": 0.5788, "num_tokens": 397080569.0, "step": 2620 }, { "epoch": 0.7745647683682502, "grad_norm": 0.2894768480000615, "learning_rate": 4.121857923497268e-05, "loss": 0.5665, "num_tokens": 397812903.0, "step": 2625 }, { "epoch": 0.7760401298318088, "grad_norm": 0.30093383810050583, "learning_rate": 4.11912568306011e-05, "loss": 0.5811, "num_tokens": 398576641.0, "step": 2630 }, { "epoch": 0.7775154912953673, "grad_norm": 0.2782759990891234, "learning_rate": 4.1163934426229514e-05, "loss": 0.5838, "num_tokens": 399344694.0, "step": 2635 }, { "epoch": 0.778990852758926, "grad_norm": 0.30224502025680056, "learning_rate": 4.1136612021857924e-05, "loss": 0.5765, "num_tokens": 400067240.0, "step": 2640 }, { "epoch": 0.7804662142224845, "grad_norm": 0.27431991020256674, "learning_rate": 4.110928961748634e-05, "loss": 0.5901, "num_tokens": 400881589.0, "step": 2645 }, { "epoch": 0.7819415756860431, "grad_norm": 0.2869210149700479, "learning_rate": 4.108196721311476e-05, "loss": 0.571, "num_tokens": 401625109.0, "step": 2650 }, { "epoch": 0.7834169371496017, "grad_norm": 0.28041286293235673, "learning_rate": 4.105464480874317e-05, "loss": 0.5954, "num_tokens": 402373951.0, "step": 2655 }, { "epoch": 0.7848922986131602, "grad_norm": 0.26692771185065506, "learning_rate": 4.102732240437159e-05, "loss": 0.5714, "num_tokens": 403101903.0, "step": 2660 }, { "epoch": 0.7863676600767188, "grad_norm": 0.26039447931317183, "learning_rate": 4.1e-05, "loss": 0.5724, "num_tokens": 403872189.0, "step": 2665 }, { "epoch": 0.7878430215402774, "grad_norm": 0.2655796456086246, "learning_rate": 4.097267759562842e-05, "loss": 0.5819, "num_tokens": 404603485.0, "step": 2670 }, { "epoch": 0.7893183830038359, "grad_norm": 0.2880434477849914, "learning_rate": 4.0945355191256834e-05, "loss": 0.5763, "num_tokens": 405338902.0, "step": 2675 }, { "epoch": 0.7907937444673945, "grad_norm": 0.26265092692371694, "learning_rate": 4.0918032786885244e-05, "loss": 0.5602, "num_tokens": 406076997.0, "step": 2680 }, { "epoch": 0.7922691059309531, "grad_norm": 0.28800357341335125, "learning_rate": 4.089071038251367e-05, "loss": 0.5865, "num_tokens": 406799869.0, "step": 2685 }, { "epoch": 0.7937444673945117, "grad_norm": 0.26111419138314673, "learning_rate": 4.086338797814208e-05, "loss": 0.5716, "num_tokens": 407521392.0, "step": 2690 }, { "epoch": 0.7952198288580702, "grad_norm": 0.2700772463400375, "learning_rate": 4.0836065573770494e-05, "loss": 0.5769, "num_tokens": 408269239.0, "step": 2695 }, { "epoch": 0.7966951903216288, "grad_norm": 0.2869300390413025, "learning_rate": 4.080874316939891e-05, "loss": 0.5549, "num_tokens": 408981539.0, "step": 2700 }, { "epoch": 0.7981705517851874, "grad_norm": 0.27199225440580044, "learning_rate": 4.078142076502732e-05, "loss": 0.5802, "num_tokens": 409790756.0, "step": 2705 }, { "epoch": 0.7996459132487459, "grad_norm": 0.24694369705135663, "learning_rate": 4.075409836065574e-05, "loss": 0.5838, "num_tokens": 410598827.0, "step": 2710 }, { "epoch": 0.8011212747123045, "grad_norm": 0.2692198753848344, "learning_rate": 4.0726775956284154e-05, "loss": 0.5799, "num_tokens": 411370938.0, "step": 2715 }, { "epoch": 0.8025966361758631, "grad_norm": 0.3007693851311115, "learning_rate": 4.069945355191257e-05, "loss": 0.5889, "num_tokens": 412121747.0, "step": 2720 }, { "epoch": 0.8040719976394216, "grad_norm": 0.28544590477118775, "learning_rate": 4.067213114754099e-05, "loss": 0.5749, "num_tokens": 412874131.0, "step": 2725 }, { "epoch": 0.8055473591029803, "grad_norm": 0.273209274638536, "learning_rate": 4.06448087431694e-05, "loss": 0.5772, "num_tokens": 413660696.0, "step": 2730 }, { "epoch": 0.8070227205665388, "grad_norm": 0.28487425098637775, "learning_rate": 4.0617486338797814e-05, "loss": 0.5799, "num_tokens": 414421259.0, "step": 2735 }, { "epoch": 0.8084980820300973, "grad_norm": 0.28318344648670124, "learning_rate": 4.059016393442623e-05, "loss": 0.5825, "num_tokens": 415167017.0, "step": 2740 }, { "epoch": 0.809973443493656, "grad_norm": 0.2798831943114676, "learning_rate": 4.056284153005465e-05, "loss": 0.5671, "num_tokens": 415889198.0, "step": 2745 }, { "epoch": 0.8114488049572145, "grad_norm": 0.30247861046587865, "learning_rate": 4.0535519125683064e-05, "loss": 0.5653, "num_tokens": 416619954.0, "step": 2750 }, { "epoch": 0.8129241664207731, "grad_norm": 0.29866378146753964, "learning_rate": 4.0508196721311474e-05, "loss": 0.5755, "num_tokens": 417365726.0, "step": 2755 }, { "epoch": 0.8143995278843317, "grad_norm": 0.2778973640011403, "learning_rate": 4.048087431693989e-05, "loss": 0.5797, "num_tokens": 418139288.0, "step": 2760 }, { "epoch": 0.8158748893478902, "grad_norm": 0.26806183886525153, "learning_rate": 4.045355191256831e-05, "loss": 0.5641, "num_tokens": 418916235.0, "step": 2765 }, { "epoch": 0.8173502508114489, "grad_norm": 0.292802087518626, "learning_rate": 4.0426229508196724e-05, "loss": 0.5692, "num_tokens": 419638594.0, "step": 2770 }, { "epoch": 0.8188256122750074, "grad_norm": 0.2959386439378845, "learning_rate": 4.039890710382514e-05, "loss": 0.5665, "num_tokens": 420390035.0, "step": 2775 }, { "epoch": 0.8203009737385659, "grad_norm": 0.29969828195676945, "learning_rate": 4.037158469945355e-05, "loss": 0.5816, "num_tokens": 421139945.0, "step": 2780 }, { "epoch": 0.8217763352021246, "grad_norm": 0.26738631711224414, "learning_rate": 4.034426229508197e-05, "loss": 0.5807, "num_tokens": 421863378.0, "step": 2785 }, { "epoch": 0.8232516966656831, "grad_norm": 0.31694917352393587, "learning_rate": 4.0316939890710384e-05, "loss": 0.5666, "num_tokens": 422602045.0, "step": 2790 }, { "epoch": 0.8247270581292416, "grad_norm": 0.27062796374810516, "learning_rate": 4.02896174863388e-05, "loss": 0.5619, "num_tokens": 423417526.0, "step": 2795 }, { "epoch": 0.8262024195928003, "grad_norm": 0.3034950540784309, "learning_rate": 4.026229508196722e-05, "loss": 0.5823, "num_tokens": 424225100.0, "step": 2800 }, { "epoch": 0.8276777810563588, "grad_norm": 0.24894945757607537, "learning_rate": 4.023497267759563e-05, "loss": 0.5732, "num_tokens": 424989835.0, "step": 2805 }, { "epoch": 0.8291531425199173, "grad_norm": 0.2687014364688091, "learning_rate": 4.0207650273224044e-05, "loss": 0.5696, "num_tokens": 425747926.0, "step": 2810 }, { "epoch": 0.830628503983476, "grad_norm": 0.27684187455376724, "learning_rate": 4.018032786885246e-05, "loss": 0.5563, "num_tokens": 426510858.0, "step": 2815 }, { "epoch": 0.8321038654470345, "grad_norm": 0.25908781370239997, "learning_rate": 4.015300546448088e-05, "loss": 0.5595, "num_tokens": 427276555.0, "step": 2820 }, { "epoch": 0.833579226910593, "grad_norm": 0.2797854365447052, "learning_rate": 4.0125683060109295e-05, "loss": 0.5696, "num_tokens": 428059394.0, "step": 2825 }, { "epoch": 0.8350545883741517, "grad_norm": 0.27290471934354327, "learning_rate": 4.0098360655737704e-05, "loss": 0.5745, "num_tokens": 428831947.0, "step": 2830 }, { "epoch": 0.8365299498377102, "grad_norm": 0.2835956229425559, "learning_rate": 4.007103825136612e-05, "loss": 0.5763, "num_tokens": 429595741.0, "step": 2835 }, { "epoch": 0.8380053113012688, "grad_norm": 0.277729998477337, "learning_rate": 4.004371584699454e-05, "loss": 0.5657, "num_tokens": 430337115.0, "step": 2840 }, { "epoch": 0.8394806727648274, "grad_norm": 0.26429614246100896, "learning_rate": 4.0016393442622955e-05, "loss": 0.5569, "num_tokens": 431115918.0, "step": 2845 }, { "epoch": 0.8409560342283859, "grad_norm": 0.3219252358436061, "learning_rate": 3.998907103825137e-05, "loss": 0.5651, "num_tokens": 431806484.0, "step": 2850 }, { "epoch": 0.8424313956919445, "grad_norm": 0.25865117871348786, "learning_rate": 3.996174863387978e-05, "loss": 0.5719, "num_tokens": 432558682.0, "step": 2855 }, { "epoch": 0.8439067571555031, "grad_norm": 0.29717602936470355, "learning_rate": 3.99344262295082e-05, "loss": 0.5674, "num_tokens": 433320114.0, "step": 2860 }, { "epoch": 0.8453821186190617, "grad_norm": 0.27237391894941937, "learning_rate": 3.9907103825136615e-05, "loss": 0.5887, "num_tokens": 434051560.0, "step": 2865 }, { "epoch": 0.8468574800826203, "grad_norm": 0.280685891385216, "learning_rate": 3.9879781420765025e-05, "loss": 0.5882, "num_tokens": 434822411.0, "step": 2870 }, { "epoch": 0.8483328415461788, "grad_norm": 0.28844918010514436, "learning_rate": 3.985245901639345e-05, "loss": 0.5681, "num_tokens": 435567449.0, "step": 2875 }, { "epoch": 0.8498082030097374, "grad_norm": 0.2868727411030871, "learning_rate": 3.982513661202186e-05, "loss": 0.5703, "num_tokens": 436326209.0, "step": 2880 }, { "epoch": 0.851283564473296, "grad_norm": 0.2754971852422752, "learning_rate": 3.9797814207650275e-05, "loss": 0.5518, "num_tokens": 437036724.0, "step": 2885 }, { "epoch": 0.8527589259368545, "grad_norm": 0.2708868254842575, "learning_rate": 3.977049180327869e-05, "loss": 0.5604, "num_tokens": 437814621.0, "step": 2890 }, { "epoch": 0.8542342874004131, "grad_norm": 0.2886813027253405, "learning_rate": 3.97431693989071e-05, "loss": 0.5745, "num_tokens": 438622815.0, "step": 2895 }, { "epoch": 0.8557096488639717, "grad_norm": 0.3240443168468636, "learning_rate": 3.9715846994535525e-05, "loss": 0.5641, "num_tokens": 439345008.0, "step": 2900 }, { "epoch": 0.8571850103275303, "grad_norm": 0.28347932796797426, "learning_rate": 3.968852459016394e-05, "loss": 0.5679, "num_tokens": 440094850.0, "step": 2905 }, { "epoch": 0.8586603717910888, "grad_norm": 0.29103023614602186, "learning_rate": 3.966120218579235e-05, "loss": 0.5684, "num_tokens": 440856367.0, "step": 2910 }, { "epoch": 0.8601357332546474, "grad_norm": 0.2741388577647317, "learning_rate": 3.963387978142077e-05, "loss": 0.5706, "num_tokens": 441609076.0, "step": 2915 }, { "epoch": 0.861611094718206, "grad_norm": 0.2918022191574656, "learning_rate": 3.960655737704918e-05, "loss": 0.5882, "num_tokens": 442403310.0, "step": 2920 }, { "epoch": 0.8630864561817645, "grad_norm": 0.2622450022642137, "learning_rate": 3.9579234972677595e-05, "loss": 0.5669, "num_tokens": 443164467.0, "step": 2925 }, { "epoch": 0.8645618176453231, "grad_norm": 0.3784443143209066, "learning_rate": 3.955191256830602e-05, "loss": 0.556, "num_tokens": 443913893.0, "step": 2930 }, { "epoch": 0.8660371791088817, "grad_norm": 0.260398214096071, "learning_rate": 3.952459016393443e-05, "loss": 0.5735, "num_tokens": 444706267.0, "step": 2935 }, { "epoch": 0.8675125405724402, "grad_norm": 0.26252568762887457, "learning_rate": 3.9497267759562845e-05, "loss": 0.5601, "num_tokens": 445488346.0, "step": 2940 }, { "epoch": 0.8689879020359988, "grad_norm": 0.32719632414039695, "learning_rate": 3.9469945355191255e-05, "loss": 0.5738, "num_tokens": 446246174.0, "step": 2945 }, { "epoch": 0.8704632634995574, "grad_norm": 0.3041564761773453, "learning_rate": 3.944262295081967e-05, "loss": 0.5728, "num_tokens": 446990791.0, "step": 2950 }, { "epoch": 0.8719386249631159, "grad_norm": 0.28829395895866955, "learning_rate": 3.9415300546448095e-05, "loss": 0.5656, "num_tokens": 447746696.0, "step": 2955 }, { "epoch": 0.8734139864266746, "grad_norm": 0.28966519770507987, "learning_rate": 3.9387978142076505e-05, "loss": 0.5391, "num_tokens": 448441867.0, "step": 2960 }, { "epoch": 0.8748893478902331, "grad_norm": 0.2550789508105881, "learning_rate": 3.936065573770492e-05, "loss": 0.5667, "num_tokens": 449191858.0, "step": 2965 }, { "epoch": 0.8763647093537916, "grad_norm": 0.2539956819901893, "learning_rate": 3.933333333333333e-05, "loss": 0.5699, "num_tokens": 449967434.0, "step": 2970 }, { "epoch": 0.8778400708173503, "grad_norm": 0.28077971563955506, "learning_rate": 3.930601092896175e-05, "loss": 0.5726, "num_tokens": 450674793.0, "step": 2975 }, { "epoch": 0.8793154322809088, "grad_norm": 0.3005715865110719, "learning_rate": 3.9278688524590165e-05, "loss": 0.563, "num_tokens": 451383698.0, "step": 2980 }, { "epoch": 0.8807907937444674, "grad_norm": 0.2474601459397318, "learning_rate": 3.925136612021858e-05, "loss": 0.5714, "num_tokens": 452164121.0, "step": 2985 }, { "epoch": 0.882266155208026, "grad_norm": 0.2753557233553114, "learning_rate": 3.9224043715847e-05, "loss": 0.5745, "num_tokens": 452945974.0, "step": 2990 }, { "epoch": 0.8837415166715845, "grad_norm": 0.3071510350051425, "learning_rate": 3.919672131147541e-05, "loss": 0.5713, "num_tokens": 453652673.0, "step": 2995 }, { "epoch": 0.8852168781351432, "grad_norm": 0.271729740735513, "learning_rate": 3.9169398907103825e-05, "loss": 0.5456, "num_tokens": 454394965.0, "step": 3000 }, { "epoch": 0.8866922395987017, "grad_norm": 0.2819581589607196, "learning_rate": 3.914207650273224e-05, "loss": 0.5816, "num_tokens": 455175120.0, "step": 3005 }, { "epoch": 0.8881676010622602, "grad_norm": 0.25577563223015376, "learning_rate": 3.911475409836066e-05, "loss": 0.5603, "num_tokens": 455904566.0, "step": 3010 }, { "epoch": 0.8896429625258189, "grad_norm": 0.31805073530015304, "learning_rate": 3.9087431693989075e-05, "loss": 0.5616, "num_tokens": 456650012.0, "step": 3015 }, { "epoch": 0.8911183239893774, "grad_norm": 0.3114949140191548, "learning_rate": 3.9060109289617485e-05, "loss": 0.5491, "num_tokens": 457296221.0, "step": 3020 }, { "epoch": 0.8925936854529359, "grad_norm": 0.26620513324958733, "learning_rate": 3.90327868852459e-05, "loss": 0.5803, "num_tokens": 458042233.0, "step": 3025 }, { "epoch": 0.8940690469164946, "grad_norm": 0.2857214663010667, "learning_rate": 3.900546448087432e-05, "loss": 0.5652, "num_tokens": 458778645.0, "step": 3030 }, { "epoch": 0.8955444083800531, "grad_norm": 0.29525545853019264, "learning_rate": 3.8978142076502735e-05, "loss": 0.5516, "num_tokens": 459475515.0, "step": 3035 }, { "epoch": 0.8970197698436116, "grad_norm": 0.2957202977527726, "learning_rate": 3.895081967213115e-05, "loss": 0.5709, "num_tokens": 460265577.0, "step": 3040 }, { "epoch": 0.8984951313071703, "grad_norm": 0.2550018038379411, "learning_rate": 3.892349726775956e-05, "loss": 0.5558, "num_tokens": 461054939.0, "step": 3045 }, { "epoch": 0.8999704927707288, "grad_norm": 0.28594504505551155, "learning_rate": 3.889617486338798e-05, "loss": 0.5676, "num_tokens": 461729701.0, "step": 3050 }, { "epoch": 0.9014458542342874, "grad_norm": 0.2805973484693316, "learning_rate": 3.8868852459016395e-05, "loss": 0.5617, "num_tokens": 462566279.0, "step": 3055 }, { "epoch": 0.902921215697846, "grad_norm": 0.2656262709104168, "learning_rate": 3.884153005464481e-05, "loss": 0.5579, "num_tokens": 463306040.0, "step": 3060 }, { "epoch": 0.9043965771614045, "grad_norm": 0.2874718314499554, "learning_rate": 3.881420765027323e-05, "loss": 0.5757, "num_tokens": 464104131.0, "step": 3065 }, { "epoch": 0.9058719386249631, "grad_norm": 0.27071645386298127, "learning_rate": 3.878688524590164e-05, "loss": 0.5508, "num_tokens": 464810216.0, "step": 3070 }, { "epoch": 0.9073473000885217, "grad_norm": 0.2724134611605627, "learning_rate": 3.8759562841530056e-05, "loss": 0.5509, "num_tokens": 465539009.0, "step": 3075 }, { "epoch": 0.9088226615520802, "grad_norm": 0.26616824832726343, "learning_rate": 3.873224043715847e-05, "loss": 0.565, "num_tokens": 466336695.0, "step": 3080 }, { "epoch": 0.9102980230156388, "grad_norm": 0.2739408501578057, "learning_rate": 3.870491803278688e-05, "loss": 0.5651, "num_tokens": 467110499.0, "step": 3085 }, { "epoch": 0.9117733844791974, "grad_norm": 0.28921302534718674, "learning_rate": 3.8677595628415306e-05, "loss": 0.5601, "num_tokens": 467841513.0, "step": 3090 }, { "epoch": 0.913248745942756, "grad_norm": 0.27067424547213925, "learning_rate": 3.865027322404372e-05, "loss": 0.5701, "num_tokens": 468598127.0, "step": 3095 }, { "epoch": 0.9147241074063146, "grad_norm": 0.2798672565080446, "learning_rate": 3.862295081967213e-05, "loss": 0.5697, "num_tokens": 469383624.0, "step": 3100 }, { "epoch": 0.9161994688698731, "grad_norm": 0.29492001072991003, "learning_rate": 3.859562841530055e-05, "loss": 0.5556, "num_tokens": 470133264.0, "step": 3105 }, { "epoch": 0.9176748303334317, "grad_norm": 0.29855054904426287, "learning_rate": 3.856830601092896e-05, "loss": 0.5552, "num_tokens": 470832188.0, "step": 3110 }, { "epoch": 0.9191501917969903, "grad_norm": 0.2681096833656888, "learning_rate": 3.854098360655738e-05, "loss": 0.5618, "num_tokens": 471583550.0, "step": 3115 }, { "epoch": 0.9206255532605488, "grad_norm": 0.2653182971051013, "learning_rate": 3.85136612021858e-05, "loss": 0.5649, "num_tokens": 472370789.0, "step": 3120 }, { "epoch": 0.9221009147241074, "grad_norm": 0.24799471288423655, "learning_rate": 3.848633879781421e-05, "loss": 0.5585, "num_tokens": 473135401.0, "step": 3125 }, { "epoch": 0.923576276187666, "grad_norm": 0.265953883352543, "learning_rate": 3.8459016393442626e-05, "loss": 0.5657, "num_tokens": 473850513.0, "step": 3130 }, { "epoch": 0.9250516376512246, "grad_norm": 0.2603459320628356, "learning_rate": 3.8431693989071036e-05, "loss": 0.5885, "num_tokens": 474637524.0, "step": 3135 }, { "epoch": 0.9265269991147831, "grad_norm": 0.28068925148560336, "learning_rate": 3.840437158469945e-05, "loss": 0.5529, "num_tokens": 475340469.0, "step": 3140 }, { "epoch": 0.9280023605783417, "grad_norm": 0.27132880553212974, "learning_rate": 3.8377049180327876e-05, "loss": 0.5698, "num_tokens": 476130519.0, "step": 3145 }, { "epoch": 0.9294777220419003, "grad_norm": 0.2872412592121913, "learning_rate": 3.8349726775956286e-05, "loss": 0.5699, "num_tokens": 476958291.0, "step": 3150 }, { "epoch": 0.9309530835054588, "grad_norm": 0.27525096228744045, "learning_rate": 3.83224043715847e-05, "loss": 0.5638, "num_tokens": 477717600.0, "step": 3155 }, { "epoch": 0.9324284449690174, "grad_norm": 0.271795306489134, "learning_rate": 3.829508196721311e-05, "loss": 0.5573, "num_tokens": 478454073.0, "step": 3160 }, { "epoch": 0.933903806432576, "grad_norm": 0.2836817185577235, "learning_rate": 3.826775956284153e-05, "loss": 0.5804, "num_tokens": 479228740.0, "step": 3165 }, { "epoch": 0.9353791678961345, "grad_norm": 0.2624346452206753, "learning_rate": 3.824043715846995e-05, "loss": 0.5572, "num_tokens": 479963287.0, "step": 3170 }, { "epoch": 0.9368545293596932, "grad_norm": 0.31605973317893116, "learning_rate": 3.821311475409836e-05, "loss": 0.5554, "num_tokens": 480713563.0, "step": 3175 }, { "epoch": 0.9383298908232517, "grad_norm": 0.2609356079831417, "learning_rate": 3.818579234972678e-05, "loss": 0.5642, "num_tokens": 481515657.0, "step": 3180 }, { "epoch": 0.9398052522868102, "grad_norm": 0.2707792188421102, "learning_rate": 3.815846994535519e-05, "loss": 0.5663, "num_tokens": 482307597.0, "step": 3185 }, { "epoch": 0.9412806137503689, "grad_norm": 0.27047747120993004, "learning_rate": 3.8131147540983606e-05, "loss": 0.5604, "num_tokens": 483073692.0, "step": 3190 }, { "epoch": 0.9427559752139274, "grad_norm": 0.24682085541915882, "learning_rate": 3.810382513661202e-05, "loss": 0.5527, "num_tokens": 483824821.0, "step": 3195 }, { "epoch": 0.9442313366774859, "grad_norm": 0.264564716026836, "learning_rate": 3.807650273224044e-05, "loss": 0.5404, "num_tokens": 484573661.0, "step": 3200 }, { "epoch": 0.9457066981410446, "grad_norm": 0.24188898860106076, "learning_rate": 3.8049180327868856e-05, "loss": 0.5746, "num_tokens": 485369303.0, "step": 3205 }, { "epoch": 0.9471820596046031, "grad_norm": 0.29467573005006337, "learning_rate": 3.8021857923497266e-05, "loss": 0.5347, "num_tokens": 486074755.0, "step": 3210 }, { "epoch": 0.9486574210681618, "grad_norm": 0.2639098954166498, "learning_rate": 3.799453551912568e-05, "loss": 0.5714, "num_tokens": 486819557.0, "step": 3215 }, { "epoch": 0.9501327825317203, "grad_norm": 0.2633397429248263, "learning_rate": 3.79672131147541e-05, "loss": 0.5602, "num_tokens": 487592078.0, "step": 3220 }, { "epoch": 0.9516081439952788, "grad_norm": 0.26286544096483416, "learning_rate": 3.7939890710382516e-05, "loss": 0.558, "num_tokens": 488400528.0, "step": 3225 }, { "epoch": 0.9530835054588375, "grad_norm": 0.2702099010597992, "learning_rate": 3.791256830601093e-05, "loss": 0.5723, "num_tokens": 489150714.0, "step": 3230 }, { "epoch": 0.954558866922396, "grad_norm": 0.26097265326918345, "learning_rate": 3.788524590163934e-05, "loss": 0.5661, "num_tokens": 489942905.0, "step": 3235 }, { "epoch": 0.9560342283859545, "grad_norm": 0.26373422028179694, "learning_rate": 3.785792349726776e-05, "loss": 0.5716, "num_tokens": 490782480.0, "step": 3240 }, { "epoch": 0.9575095898495132, "grad_norm": 0.2555802308739186, "learning_rate": 3.7830601092896176e-05, "loss": 0.5609, "num_tokens": 491572600.0, "step": 3245 }, { "epoch": 0.9589849513130717, "grad_norm": 0.2833052155271193, "learning_rate": 3.780327868852459e-05, "loss": 0.5631, "num_tokens": 492306051.0, "step": 3250 }, { "epoch": 0.9604603127766302, "grad_norm": 0.2923145983249554, "learning_rate": 3.777595628415301e-05, "loss": 0.5515, "num_tokens": 493040696.0, "step": 3255 }, { "epoch": 0.9619356742401889, "grad_norm": 0.2599513657129176, "learning_rate": 3.774863387978142e-05, "loss": 0.5663, "num_tokens": 493828094.0, "step": 3260 }, { "epoch": 0.9634110357037474, "grad_norm": 0.265642907447618, "learning_rate": 3.7721311475409836e-05, "loss": 0.5464, "num_tokens": 494582945.0, "step": 3265 }, { "epoch": 0.964886397167306, "grad_norm": 0.26873677234962906, "learning_rate": 3.769398907103825e-05, "loss": 0.544, "num_tokens": 495304011.0, "step": 3270 }, { "epoch": 0.9663617586308646, "grad_norm": 0.2504370505699889, "learning_rate": 3.766666666666667e-05, "loss": 0.5587, "num_tokens": 496087264.0, "step": 3275 }, { "epoch": 0.9678371200944231, "grad_norm": 0.2756766940189323, "learning_rate": 3.7639344262295086e-05, "loss": 0.5514, "num_tokens": 496854635.0, "step": 3280 }, { "epoch": 0.9693124815579817, "grad_norm": 0.2715257867110226, "learning_rate": 3.76120218579235e-05, "loss": 0.5679, "num_tokens": 497591596.0, "step": 3285 }, { "epoch": 0.9707878430215403, "grad_norm": 0.38831636550180015, "learning_rate": 3.758469945355191e-05, "loss": 0.5515, "num_tokens": 498371750.0, "step": 3290 }, { "epoch": 0.9722632044850988, "grad_norm": 0.24745302187549728, "learning_rate": 3.755737704918033e-05, "loss": 0.5433, "num_tokens": 499130655.0, "step": 3295 }, { "epoch": 0.9737385659486574, "grad_norm": 0.33063308282217413, "learning_rate": 3.7530054644808746e-05, "loss": 0.5407, "num_tokens": 499792945.0, "step": 3300 }, { "epoch": 0.975213927412216, "grad_norm": 0.2690018487189958, "learning_rate": 3.750273224043716e-05, "loss": 0.5598, "num_tokens": 500532609.0, "step": 3305 }, { "epoch": 0.9766892888757746, "grad_norm": 0.2572475559674306, "learning_rate": 3.747540983606558e-05, "loss": 0.5312, "num_tokens": 501244629.0, "step": 3310 }, { "epoch": 0.9781646503393331, "grad_norm": 0.2760255552755503, "learning_rate": 3.744808743169399e-05, "loss": 0.5599, "num_tokens": 502020093.0, "step": 3315 }, { "epoch": 0.9796400118028917, "grad_norm": 0.25786862017763107, "learning_rate": 3.7420765027322407e-05, "loss": 0.5392, "num_tokens": 502725869.0, "step": 3320 }, { "epoch": 0.9811153732664503, "grad_norm": 0.25122239625173465, "learning_rate": 3.7393442622950816e-05, "loss": 0.5564, "num_tokens": 503521425.0, "step": 3325 }, { "epoch": 0.9825907347300088, "grad_norm": 0.2566185252731489, "learning_rate": 3.736612021857924e-05, "loss": 0.5792, "num_tokens": 504340233.0, "step": 3330 }, { "epoch": 0.9840660961935674, "grad_norm": 0.2707389885245592, "learning_rate": 3.733879781420766e-05, "loss": 0.5683, "num_tokens": 505138573.0, "step": 3335 }, { "epoch": 0.985541457657126, "grad_norm": 0.2696201759824027, "learning_rate": 3.7311475409836067e-05, "loss": 0.5678, "num_tokens": 505902907.0, "step": 3340 }, { "epoch": 0.9870168191206846, "grad_norm": 0.26718746299464563, "learning_rate": 3.728415300546448e-05, "loss": 0.5577, "num_tokens": 506621124.0, "step": 3345 }, { "epoch": 0.9884921805842432, "grad_norm": 0.2546356629716283, "learning_rate": 3.725683060109289e-05, "loss": 0.5497, "num_tokens": 507364304.0, "step": 3350 }, { "epoch": 0.9899675420478017, "grad_norm": 0.26262748711956263, "learning_rate": 3.722950819672131e-05, "loss": 0.5608, "num_tokens": 508160302.0, "step": 3355 }, { "epoch": 0.9914429035113603, "grad_norm": 0.25179335097073635, "learning_rate": 3.7202185792349733e-05, "loss": 0.5503, "num_tokens": 508937327.0, "step": 3360 }, { "epoch": 0.9929182649749189, "grad_norm": 0.28053727408493906, "learning_rate": 3.717486338797814e-05, "loss": 0.5445, "num_tokens": 509616623.0, "step": 3365 }, { "epoch": 0.9943936264384774, "grad_norm": 0.28384907198578063, "learning_rate": 3.714754098360656e-05, "loss": 0.5551, "num_tokens": 510416500.0, "step": 3370 }, { "epoch": 0.995868987902036, "grad_norm": 0.2960236091504355, "learning_rate": 3.712021857923497e-05, "loss": 0.5392, "num_tokens": 511168328.0, "step": 3375 }, { "epoch": 0.9973443493655946, "grad_norm": 0.25662941263223643, "learning_rate": 3.709289617486339e-05, "loss": 0.5544, "num_tokens": 511951575.0, "step": 3380 }, { "epoch": 0.9988197108291531, "grad_norm": 0.25107635188029376, "learning_rate": 3.706557377049181e-05, "loss": 0.54, "num_tokens": 512601475.0, "step": 3385 }, { "epoch": 1.0002950722927118, "grad_norm": 0.2701035814922801, "learning_rate": 3.703825136612022e-05, "loss": 0.539, "num_tokens": 513267860.0, "step": 3390 }, { "epoch": 1.0017704337562703, "grad_norm": 0.28803036904885937, "learning_rate": 3.701092896174864e-05, "loss": 0.4936, "num_tokens": 513998543.0, "step": 3395 }, { "epoch": 1.0032457952198288, "grad_norm": 0.26539135380004236, "learning_rate": 3.698360655737705e-05, "loss": 0.4996, "num_tokens": 514784938.0, "step": 3400 }, { "epoch": 1.0047211566833874, "grad_norm": 0.2586874144438319, "learning_rate": 3.6956284153005463e-05, "loss": 0.5044, "num_tokens": 515540731.0, "step": 3405 }, { "epoch": 1.006196518146946, "grad_norm": 0.29096732985925616, "learning_rate": 3.692896174863388e-05, "loss": 0.5033, "num_tokens": 516296925.0, "step": 3410 }, { "epoch": 1.0076718796105046, "grad_norm": 0.25745103082438253, "learning_rate": 3.69016393442623e-05, "loss": 0.5011, "num_tokens": 517056166.0, "step": 3415 }, { "epoch": 1.0091472410740632, "grad_norm": 0.27012763014708097, "learning_rate": 3.6874316939890714e-05, "loss": 0.4993, "num_tokens": 517790970.0, "step": 3420 }, { "epoch": 1.0106226025376217, "grad_norm": 0.2684944262513349, "learning_rate": 3.6846994535519124e-05, "loss": 0.4903, "num_tokens": 518577134.0, "step": 3425 }, { "epoch": 1.0120979640011802, "grad_norm": 0.2651776075808073, "learning_rate": 3.681967213114754e-05, "loss": 0.5172, "num_tokens": 519298646.0, "step": 3430 }, { "epoch": 1.0135733254647388, "grad_norm": 0.2960717950041596, "learning_rate": 3.679234972677596e-05, "loss": 0.4779, "num_tokens": 520006330.0, "step": 3435 }, { "epoch": 1.0150486869282975, "grad_norm": 0.293116532057976, "learning_rate": 3.6765027322404374e-05, "loss": 0.491, "num_tokens": 520714596.0, "step": 3440 }, { "epoch": 1.016524048391856, "grad_norm": 0.266009779121358, "learning_rate": 3.673770491803279e-05, "loss": 0.5031, "num_tokens": 521451669.0, "step": 3445 }, { "epoch": 1.0179994098554146, "grad_norm": 0.2701086213950421, "learning_rate": 3.67103825136612e-05, "loss": 0.5192, "num_tokens": 522284568.0, "step": 3450 }, { "epoch": 1.0194747713189731, "grad_norm": 0.2697049361318903, "learning_rate": 3.668306010928962e-05, "loss": 0.4826, "num_tokens": 523006119.0, "step": 3455 }, { "epoch": 1.0209501327825317, "grad_norm": 0.3095617483694332, "learning_rate": 3.6655737704918034e-05, "loss": 0.4855, "num_tokens": 523653135.0, "step": 3460 }, { "epoch": 1.0224254942460902, "grad_norm": 0.2599542527098017, "learning_rate": 3.662841530054645e-05, "loss": 0.4923, "num_tokens": 524410753.0, "step": 3465 }, { "epoch": 1.023900855709649, "grad_norm": 0.29116494044466223, "learning_rate": 3.660109289617487e-05, "loss": 0.4854, "num_tokens": 525165674.0, "step": 3470 }, { "epoch": 1.0253762171732075, "grad_norm": 0.27175522876759184, "learning_rate": 3.6573770491803284e-05, "loss": 0.5, "num_tokens": 525960389.0, "step": 3475 }, { "epoch": 1.026851578636766, "grad_norm": 0.32482501333880326, "learning_rate": 3.6546448087431694e-05, "loss": 0.4942, "num_tokens": 526697425.0, "step": 3480 }, { "epoch": 1.0283269401003245, "grad_norm": 0.24797858819831434, "learning_rate": 3.651912568306011e-05, "loss": 0.5026, "num_tokens": 527496071.0, "step": 3485 }, { "epoch": 1.029802301563883, "grad_norm": 0.2683966290487284, "learning_rate": 3.649180327868853e-05, "loss": 0.4966, "num_tokens": 528246767.0, "step": 3490 }, { "epoch": 1.0312776630274416, "grad_norm": 0.25027650708548776, "learning_rate": 3.6464480874316944e-05, "loss": 0.5186, "num_tokens": 529049171.0, "step": 3495 }, { "epoch": 1.0327530244910004, "grad_norm": 0.27375454629811447, "learning_rate": 3.643715846994536e-05, "loss": 0.5007, "num_tokens": 529825493.0, "step": 3500 }, { "epoch": 1.034228385954559, "grad_norm": 0.25818798257860875, "learning_rate": 3.640983606557377e-05, "loss": 0.4836, "num_tokens": 530540463.0, "step": 3505 }, { "epoch": 1.0357037474181174, "grad_norm": 0.28944129192073265, "learning_rate": 3.638251366120219e-05, "loss": 0.4945, "num_tokens": 531285138.0, "step": 3510 }, { "epoch": 1.037179108881676, "grad_norm": 0.2614949834631852, "learning_rate": 3.6355191256830604e-05, "loss": 0.4948, "num_tokens": 532014489.0, "step": 3515 }, { "epoch": 1.0386544703452345, "grad_norm": 0.25615572028718275, "learning_rate": 3.632786885245902e-05, "loss": 0.4911, "num_tokens": 532775678.0, "step": 3520 }, { "epoch": 1.0401298318087933, "grad_norm": 0.268088315622947, "learning_rate": 3.630054644808744e-05, "loss": 0.4954, "num_tokens": 533572181.0, "step": 3525 }, { "epoch": 1.0416051932723518, "grad_norm": 0.30298721013298824, "learning_rate": 3.627322404371585e-05, "loss": 0.489, "num_tokens": 534305891.0, "step": 3530 }, { "epoch": 1.0430805547359103, "grad_norm": 0.25969176453485354, "learning_rate": 3.6245901639344264e-05, "loss": 0.5109, "num_tokens": 535028223.0, "step": 3535 }, { "epoch": 1.0445559161994689, "grad_norm": 0.26944023691366575, "learning_rate": 3.6218579234972674e-05, "loss": 0.5116, "num_tokens": 535855706.0, "step": 3540 }, { "epoch": 1.0460312776630274, "grad_norm": 0.26206929067792983, "learning_rate": 3.61912568306011e-05, "loss": 0.4962, "num_tokens": 536656996.0, "step": 3545 }, { "epoch": 1.047506639126586, "grad_norm": 0.2798104243297531, "learning_rate": 3.6163934426229514e-05, "loss": 0.5006, "num_tokens": 537464921.0, "step": 3550 }, { "epoch": 1.0489820005901447, "grad_norm": 0.24610127239531418, "learning_rate": 3.6136612021857924e-05, "loss": 0.517, "num_tokens": 538233995.0, "step": 3555 }, { "epoch": 1.0504573620537032, "grad_norm": 0.28425092754206155, "learning_rate": 3.610928961748634e-05, "loss": 0.5045, "num_tokens": 539024121.0, "step": 3560 }, { "epoch": 1.0519327235172617, "grad_norm": 0.2674003091104434, "learning_rate": 3.608196721311475e-05, "loss": 0.4726, "num_tokens": 539726556.0, "step": 3565 }, { "epoch": 1.0534080849808203, "grad_norm": 0.2665744809768469, "learning_rate": 3.6054644808743174e-05, "loss": 0.4899, "num_tokens": 540500766.0, "step": 3570 }, { "epoch": 1.0548834464443788, "grad_norm": 0.2896041277050895, "learning_rate": 3.602732240437159e-05, "loss": 0.4998, "num_tokens": 541283003.0, "step": 3575 }, { "epoch": 1.0563588079079373, "grad_norm": 0.27575018655388567, "learning_rate": 3.6e-05, "loss": 0.5044, "num_tokens": 542044355.0, "step": 3580 }, { "epoch": 1.057834169371496, "grad_norm": 0.27513051217119255, "learning_rate": 3.597267759562842e-05, "loss": 0.5003, "num_tokens": 542841935.0, "step": 3585 }, { "epoch": 1.0593095308350546, "grad_norm": 0.2748964967623885, "learning_rate": 3.594535519125683e-05, "loss": 0.5009, "num_tokens": 543588404.0, "step": 3590 }, { "epoch": 1.0607848922986132, "grad_norm": 0.2784065722985205, "learning_rate": 3.5918032786885244e-05, "loss": 0.4792, "num_tokens": 544325916.0, "step": 3595 }, { "epoch": 1.0622602537621717, "grad_norm": 0.278948479560299, "learning_rate": 3.589071038251367e-05, "loss": 0.5029, "num_tokens": 545095812.0, "step": 3600 }, { "epoch": 1.0637356152257302, "grad_norm": 0.2370661826911396, "learning_rate": 3.586338797814208e-05, "loss": 0.5104, "num_tokens": 545881432.0, "step": 3605 }, { "epoch": 1.065210976689289, "grad_norm": 0.2625803836257536, "learning_rate": 3.5836065573770494e-05, "loss": 0.4985, "num_tokens": 546646141.0, "step": 3610 }, { "epoch": 1.0666863381528475, "grad_norm": 0.2573870601771387, "learning_rate": 3.5808743169398904e-05, "loss": 0.4883, "num_tokens": 547428649.0, "step": 3615 }, { "epoch": 1.068161699616406, "grad_norm": 0.2638808508290738, "learning_rate": 3.578142076502732e-05, "loss": 0.4901, "num_tokens": 548192281.0, "step": 3620 }, { "epoch": 1.0696370610799646, "grad_norm": 0.6095032086359355, "learning_rate": 3.5754098360655745e-05, "loss": 0.498, "num_tokens": 548852617.0, "step": 3625 }, { "epoch": 1.0711124225435231, "grad_norm": 0.28407199061789346, "learning_rate": 3.5726775956284154e-05, "loss": 0.4962, "num_tokens": 549543339.0, "step": 3630 }, { "epoch": 1.0725877840070817, "grad_norm": 0.2655106914267652, "learning_rate": 3.569945355191257e-05, "loss": 0.5014, "num_tokens": 550235282.0, "step": 3635 }, { "epoch": 1.0740631454706402, "grad_norm": 0.2807563651010842, "learning_rate": 3.567213114754099e-05, "loss": 0.506, "num_tokens": 551024581.0, "step": 3640 }, { "epoch": 1.075538506934199, "grad_norm": 0.6440579555100152, "learning_rate": 3.56448087431694e-05, "loss": 0.5065, "num_tokens": 551783071.0, "step": 3645 }, { "epoch": 1.0770138683977575, "grad_norm": 0.25707143978714236, "learning_rate": 3.5617486338797815e-05, "loss": 0.4981, "num_tokens": 552552869.0, "step": 3650 }, { "epoch": 1.078489229861316, "grad_norm": 0.2826747048961587, "learning_rate": 3.559016393442623e-05, "loss": 0.4743, "num_tokens": 553286827.0, "step": 3655 }, { "epoch": 1.0799645913248745, "grad_norm": 0.2884642543000289, "learning_rate": 3.556284153005465e-05, "loss": 0.4961, "num_tokens": 554031843.0, "step": 3660 }, { "epoch": 1.081439952788433, "grad_norm": 0.26607639720953985, "learning_rate": 3.5535519125683065e-05, "loss": 0.5036, "num_tokens": 554752401.0, "step": 3665 }, { "epoch": 1.0829153142519918, "grad_norm": 0.26877586070697074, "learning_rate": 3.5508196721311475e-05, "loss": 0.5058, "num_tokens": 555527221.0, "step": 3670 }, { "epoch": 1.0843906757155504, "grad_norm": 0.29317372558635557, "learning_rate": 3.548087431693989e-05, "loss": 0.5065, "num_tokens": 556227326.0, "step": 3675 }, { "epoch": 1.085866037179109, "grad_norm": 0.28061725316126257, "learning_rate": 3.545355191256831e-05, "loss": 0.4969, "num_tokens": 557036933.0, "step": 3680 }, { "epoch": 1.0873413986426674, "grad_norm": 0.28541307080646455, "learning_rate": 3.5426229508196725e-05, "loss": 0.5155, "num_tokens": 557852580.0, "step": 3685 }, { "epoch": 1.088816760106226, "grad_norm": 0.2705683934425315, "learning_rate": 3.539890710382514e-05, "loss": 0.4917, "num_tokens": 558629565.0, "step": 3690 }, { "epoch": 1.0902921215697845, "grad_norm": 0.2812559173568951, "learning_rate": 3.537158469945355e-05, "loss": 0.5004, "num_tokens": 559396677.0, "step": 3695 }, { "epoch": 1.0917674830333433, "grad_norm": 0.24561545497607268, "learning_rate": 3.534426229508197e-05, "loss": 0.4947, "num_tokens": 560108378.0, "step": 3700 }, { "epoch": 1.0932428444969018, "grad_norm": 0.25973989161372435, "learning_rate": 3.5316939890710385e-05, "loss": 0.4855, "num_tokens": 560822721.0, "step": 3705 }, { "epoch": 1.0947182059604603, "grad_norm": 0.25956185809286564, "learning_rate": 3.52896174863388e-05, "loss": 0.4987, "num_tokens": 561587276.0, "step": 3710 }, { "epoch": 1.0961935674240189, "grad_norm": 0.264534922205557, "learning_rate": 3.526229508196722e-05, "loss": 0.4836, "num_tokens": 562280559.0, "step": 3715 }, { "epoch": 1.0976689288875774, "grad_norm": 0.26600974711546843, "learning_rate": 3.523497267759563e-05, "loss": 0.5137, "num_tokens": 563052314.0, "step": 3720 }, { "epoch": 1.099144290351136, "grad_norm": 0.2973740753886228, "learning_rate": 3.5207650273224045e-05, "loss": 0.4982, "num_tokens": 563802221.0, "step": 3725 }, { "epoch": 1.1006196518146947, "grad_norm": 0.26758378328556187, "learning_rate": 3.518032786885246e-05, "loss": 0.5, "num_tokens": 564626173.0, "step": 3730 }, { "epoch": 1.1020950132782532, "grad_norm": 0.3211733229464461, "learning_rate": 3.515300546448088e-05, "loss": 0.4826, "num_tokens": 565340598.0, "step": 3735 }, { "epoch": 1.1035703747418117, "grad_norm": 0.26290586699723634, "learning_rate": 3.5125683060109295e-05, "loss": 0.4925, "num_tokens": 566088505.0, "step": 3740 }, { "epoch": 1.1050457362053703, "grad_norm": 0.2797315050467399, "learning_rate": 3.5098360655737705e-05, "loss": 0.4996, "num_tokens": 566887287.0, "step": 3745 }, { "epoch": 1.1065210976689288, "grad_norm": 0.2543066383799046, "learning_rate": 3.507103825136612e-05, "loss": 0.4995, "num_tokens": 567670847.0, "step": 3750 }, { "epoch": 1.1079964591324876, "grad_norm": 0.23693952962194706, "learning_rate": 3.504371584699453e-05, "loss": 0.485, "num_tokens": 568441311.0, "step": 3755 }, { "epoch": 1.109471820596046, "grad_norm": 0.28772412339340586, "learning_rate": 3.5016393442622955e-05, "loss": 0.4643, "num_tokens": 569181480.0, "step": 3760 }, { "epoch": 1.1109471820596046, "grad_norm": 0.25659454627715383, "learning_rate": 3.498907103825137e-05, "loss": 0.5117, "num_tokens": 570007804.0, "step": 3765 }, { "epoch": 1.1124225435231632, "grad_norm": 0.28646595584039597, "learning_rate": 3.496174863387978e-05, "loss": 0.5071, "num_tokens": 570756938.0, "step": 3770 }, { "epoch": 1.1138979049867217, "grad_norm": 0.2503881593358946, "learning_rate": 3.49344262295082e-05, "loss": 0.5028, "num_tokens": 571578360.0, "step": 3775 }, { "epoch": 1.1153732664502802, "grad_norm": 0.2909565399070889, "learning_rate": 3.490710382513661e-05, "loss": 0.5128, "num_tokens": 572388243.0, "step": 3780 }, { "epoch": 1.116848627913839, "grad_norm": 0.3205753131648363, "learning_rate": 3.487978142076503e-05, "loss": 0.5069, "num_tokens": 573173912.0, "step": 3785 }, { "epoch": 1.1183239893773975, "grad_norm": 0.2769257065311571, "learning_rate": 3.485245901639345e-05, "loss": 0.5017, "num_tokens": 573941357.0, "step": 3790 }, { "epoch": 1.119799350840956, "grad_norm": 0.2626256492957004, "learning_rate": 3.482513661202186e-05, "loss": 0.4835, "num_tokens": 574651492.0, "step": 3795 }, { "epoch": 1.1212747123045146, "grad_norm": 0.27269314655320026, "learning_rate": 3.4797814207650275e-05, "loss": 0.4973, "num_tokens": 575439480.0, "step": 3800 }, { "epoch": 1.1227500737680731, "grad_norm": 0.26559969988599025, "learning_rate": 3.4770491803278685e-05, "loss": 0.4981, "num_tokens": 576203026.0, "step": 3805 }, { "epoch": 1.1242254352316317, "grad_norm": 0.26990111460116173, "learning_rate": 3.47431693989071e-05, "loss": 0.5041, "num_tokens": 576931561.0, "step": 3810 }, { "epoch": 1.1257007966951904, "grad_norm": 0.2645804367893137, "learning_rate": 3.4715846994535525e-05, "loss": 0.4837, "num_tokens": 577703164.0, "step": 3815 }, { "epoch": 1.127176158158749, "grad_norm": 0.2842861919112397, "learning_rate": 3.4688524590163935e-05, "loss": 0.4899, "num_tokens": 578425486.0, "step": 3820 }, { "epoch": 1.1286515196223075, "grad_norm": 0.28528494338344823, "learning_rate": 3.466120218579235e-05, "loss": 0.5101, "num_tokens": 579202898.0, "step": 3825 }, { "epoch": 1.130126881085866, "grad_norm": 0.26707435856869843, "learning_rate": 3.463387978142077e-05, "loss": 0.4912, "num_tokens": 579930704.0, "step": 3830 }, { "epoch": 1.1316022425494245, "grad_norm": 0.28716403736206353, "learning_rate": 3.460655737704918e-05, "loss": 0.4717, "num_tokens": 580619075.0, "step": 3835 }, { "epoch": 1.1330776040129833, "grad_norm": 0.2766751345468759, "learning_rate": 3.45792349726776e-05, "loss": 0.4928, "num_tokens": 581364279.0, "step": 3840 }, { "epoch": 1.1345529654765418, "grad_norm": 0.2782785926769555, "learning_rate": 3.455191256830601e-05, "loss": 0.4807, "num_tokens": 582082470.0, "step": 3845 }, { "epoch": 1.1360283269401004, "grad_norm": 0.2694541673458798, "learning_rate": 3.452459016393443e-05, "loss": 0.4819, "num_tokens": 582791235.0, "step": 3850 }, { "epoch": 1.137503688403659, "grad_norm": 0.29265150235891035, "learning_rate": 3.4497267759562845e-05, "loss": 0.4781, "num_tokens": 583502050.0, "step": 3855 }, { "epoch": 1.1389790498672174, "grad_norm": 0.2749479261828194, "learning_rate": 3.4469945355191255e-05, "loss": 0.485, "num_tokens": 584280670.0, "step": 3860 }, { "epoch": 1.140454411330776, "grad_norm": 0.28426968865213137, "learning_rate": 3.444262295081967e-05, "loss": 0.5115, "num_tokens": 585059875.0, "step": 3865 }, { "epoch": 1.1419297727943345, "grad_norm": 0.2793155395792678, "learning_rate": 3.441530054644809e-05, "loss": 0.5066, "num_tokens": 585772155.0, "step": 3870 }, { "epoch": 1.1434051342578933, "grad_norm": 0.2549845634773826, "learning_rate": 3.4387978142076505e-05, "loss": 0.4882, "num_tokens": 586520733.0, "step": 3875 }, { "epoch": 1.1448804957214518, "grad_norm": 0.26617286280364333, "learning_rate": 3.436065573770492e-05, "loss": 0.4895, "num_tokens": 587229432.0, "step": 3880 }, { "epoch": 1.1463558571850103, "grad_norm": 0.2736636673948781, "learning_rate": 3.433333333333333e-05, "loss": 0.4881, "num_tokens": 587975401.0, "step": 3885 }, { "epoch": 1.1478312186485689, "grad_norm": 0.25949891933525904, "learning_rate": 3.430601092896175e-05, "loss": 0.4873, "num_tokens": 588762578.0, "step": 3890 }, { "epoch": 1.1493065801121274, "grad_norm": 0.26472006031880013, "learning_rate": 3.4278688524590166e-05, "loss": 0.4882, "num_tokens": 589487023.0, "step": 3895 }, { "epoch": 1.1507819415756861, "grad_norm": 0.25555227465983993, "learning_rate": 3.425136612021858e-05, "loss": 0.4968, "num_tokens": 590243985.0, "step": 3900 }, { "epoch": 1.1522573030392447, "grad_norm": 0.33556080205458233, "learning_rate": 3.4224043715847e-05, "loss": 0.5019, "num_tokens": 590999553.0, "step": 3905 }, { "epoch": 1.1537326645028032, "grad_norm": 0.2888735216669446, "learning_rate": 3.419672131147541e-05, "loss": 0.5025, "num_tokens": 591820094.0, "step": 3910 }, { "epoch": 1.1552080259663617, "grad_norm": 0.7102174872088697, "learning_rate": 3.4169398907103826e-05, "loss": 0.4921, "num_tokens": 592558185.0, "step": 3915 }, { "epoch": 1.1566833874299203, "grad_norm": 0.2674441190347782, "learning_rate": 3.414207650273224e-05, "loss": 0.4896, "num_tokens": 593309782.0, "step": 3920 }, { "epoch": 1.1581587488934788, "grad_norm": 0.2598427859391907, "learning_rate": 3.411475409836066e-05, "loss": 0.4915, "num_tokens": 594069370.0, "step": 3925 }, { "epoch": 1.1596341103570376, "grad_norm": 0.26048862244886667, "learning_rate": 3.4087431693989076e-05, "loss": 0.4776, "num_tokens": 594821366.0, "step": 3930 }, { "epoch": 1.161109471820596, "grad_norm": 0.2792995510187664, "learning_rate": 3.4060109289617486e-05, "loss": 0.4936, "num_tokens": 595563933.0, "step": 3935 }, { "epoch": 1.1625848332841546, "grad_norm": 0.2788977738197144, "learning_rate": 3.40327868852459e-05, "loss": 0.5041, "num_tokens": 596266604.0, "step": 3940 }, { "epoch": 1.1640601947477132, "grad_norm": 0.2702677637954942, "learning_rate": 3.400546448087432e-05, "loss": 0.4974, "num_tokens": 597018525.0, "step": 3945 }, { "epoch": 1.1655355562112717, "grad_norm": 0.27650623422020126, "learning_rate": 3.3978142076502736e-05, "loss": 0.4848, "num_tokens": 597761871.0, "step": 3950 }, { "epoch": 1.1670109176748302, "grad_norm": 0.27577261722703256, "learning_rate": 3.395081967213115e-05, "loss": 0.4955, "num_tokens": 598512598.0, "step": 3955 }, { "epoch": 1.168486279138389, "grad_norm": 0.2909230963018749, "learning_rate": 3.392349726775956e-05, "loss": 0.482, "num_tokens": 599254008.0, "step": 3960 }, { "epoch": 1.1699616406019475, "grad_norm": 0.2786280916626819, "learning_rate": 3.389617486338798e-05, "loss": 0.4868, "num_tokens": 599974774.0, "step": 3965 }, { "epoch": 1.171437002065506, "grad_norm": 0.2882810416564183, "learning_rate": 3.386885245901639e-05, "loss": 0.4721, "num_tokens": 600697404.0, "step": 3970 }, { "epoch": 1.1729123635290646, "grad_norm": 0.2576661022948717, "learning_rate": 3.384153005464481e-05, "loss": 0.4973, "num_tokens": 601489819.0, "step": 3975 }, { "epoch": 1.1743877249926231, "grad_norm": 0.3176591478048185, "learning_rate": 3.381420765027323e-05, "loss": 0.501, "num_tokens": 602302093.0, "step": 3980 }, { "epoch": 1.1758630864561819, "grad_norm": 0.24544887118037986, "learning_rate": 3.378688524590164e-05, "loss": 0.5023, "num_tokens": 603125758.0, "step": 3985 }, { "epoch": 1.1773384479197404, "grad_norm": 0.27088346112828066, "learning_rate": 3.3759562841530056e-05, "loss": 0.4769, "num_tokens": 603872457.0, "step": 3990 }, { "epoch": 1.178813809383299, "grad_norm": 0.27564777812526475, "learning_rate": 3.3732240437158466e-05, "loss": 0.5117, "num_tokens": 604637697.0, "step": 3995 }, { "epoch": 1.1802891708468575, "grad_norm": 0.24815156387633802, "learning_rate": 3.370491803278689e-05, "loss": 0.4761, "num_tokens": 605407634.0, "step": 4000 }, { "epoch": 1.181764532310416, "grad_norm": 0.27745601541292375, "learning_rate": 3.3677595628415306e-05, "loss": 0.4843, "num_tokens": 606156983.0, "step": 4005 }, { "epoch": 1.1832398937739745, "grad_norm": 0.27898978064897045, "learning_rate": 3.3650273224043716e-05, "loss": 0.5132, "num_tokens": 606932634.0, "step": 4010 }, { "epoch": 1.184715255237533, "grad_norm": 0.26442231484425444, "learning_rate": 3.362295081967213e-05, "loss": 0.4808, "num_tokens": 607656525.0, "step": 4015 }, { "epoch": 1.1861906167010918, "grad_norm": 0.2510997224820333, "learning_rate": 3.359562841530055e-05, "loss": 0.4974, "num_tokens": 608414752.0, "step": 4020 }, { "epoch": 1.1876659781646504, "grad_norm": 0.27205906876657987, "learning_rate": 3.356830601092896e-05, "loss": 0.5044, "num_tokens": 609209403.0, "step": 4025 }, { "epoch": 1.189141339628209, "grad_norm": 0.25393522495477744, "learning_rate": 3.354098360655738e-05, "loss": 0.4904, "num_tokens": 609924411.0, "step": 4030 }, { "epoch": 1.1906167010917674, "grad_norm": 0.2645474351636496, "learning_rate": 3.351366120218579e-05, "loss": 0.4943, "num_tokens": 610660240.0, "step": 4035 }, { "epoch": 1.192092062555326, "grad_norm": 0.3026917751866186, "learning_rate": 3.348633879781421e-05, "loss": 0.5066, "num_tokens": 611448181.0, "step": 4040 }, { "epoch": 1.1935674240188847, "grad_norm": 0.2892798349330594, "learning_rate": 3.3459016393442626e-05, "loss": 0.5138, "num_tokens": 612274767.0, "step": 4045 }, { "epoch": 1.1950427854824432, "grad_norm": 0.2939525869163098, "learning_rate": 3.3431693989071036e-05, "loss": 0.4937, "num_tokens": 613011470.0, "step": 4050 }, { "epoch": 1.1965181469460018, "grad_norm": 0.26851669300296854, "learning_rate": 3.340437158469946e-05, "loss": 0.4915, "num_tokens": 613768320.0, "step": 4055 }, { "epoch": 1.1979935084095603, "grad_norm": 0.2894216238768941, "learning_rate": 3.337704918032787e-05, "loss": 0.5087, "num_tokens": 614528309.0, "step": 4060 }, { "epoch": 1.1994688698731188, "grad_norm": 0.27650245036006266, "learning_rate": 3.3349726775956286e-05, "loss": 0.4798, "num_tokens": 615206081.0, "step": 4065 }, { "epoch": 1.2009442313366776, "grad_norm": 0.28735188777931436, "learning_rate": 3.33224043715847e-05, "loss": 0.5018, "num_tokens": 615984089.0, "step": 4070 }, { "epoch": 1.2024195928002361, "grad_norm": 0.2740163680906128, "learning_rate": 3.329508196721311e-05, "loss": 0.4807, "num_tokens": 616662058.0, "step": 4075 }, { "epoch": 1.2038949542637947, "grad_norm": 0.2565770383405558, "learning_rate": 3.326775956284153e-05, "loss": 0.4942, "num_tokens": 617507187.0, "step": 4080 }, { "epoch": 1.2053703157273532, "grad_norm": 0.24220898954125136, "learning_rate": 3.3240437158469946e-05, "loss": 0.4786, "num_tokens": 618232452.0, "step": 4085 }, { "epoch": 1.2068456771909117, "grad_norm": 0.26926614829311013, "learning_rate": 3.321311475409836e-05, "loss": 0.4758, "num_tokens": 618942723.0, "step": 4090 }, { "epoch": 1.2083210386544703, "grad_norm": 0.24681245504148683, "learning_rate": 3.318579234972678e-05, "loss": 0.5, "num_tokens": 619750976.0, "step": 4095 }, { "epoch": 1.2097964001180288, "grad_norm": 0.2583622783974962, "learning_rate": 3.315846994535519e-05, "loss": 0.4768, "num_tokens": 620485780.0, "step": 4100 }, { "epoch": 1.2112717615815876, "grad_norm": 0.26510709775011926, "learning_rate": 3.3131147540983606e-05, "loss": 0.4877, "num_tokens": 621213555.0, "step": 4105 }, { "epoch": 1.212747123045146, "grad_norm": 0.2775480125437585, "learning_rate": 3.310382513661202e-05, "loss": 0.4897, "num_tokens": 621941106.0, "step": 4110 }, { "epoch": 1.2142224845087046, "grad_norm": 0.24470725926326659, "learning_rate": 3.307650273224044e-05, "loss": 0.5067, "num_tokens": 622726584.0, "step": 4115 }, { "epoch": 1.2156978459722632, "grad_norm": 0.2623319554496149, "learning_rate": 3.3049180327868857e-05, "loss": 0.4661, "num_tokens": 623424324.0, "step": 4120 }, { "epoch": 1.2171732074358217, "grad_norm": 0.26176314244850246, "learning_rate": 3.3021857923497266e-05, "loss": 0.5034, "num_tokens": 624218228.0, "step": 4125 }, { "epoch": 1.2186485688993804, "grad_norm": 0.26305795393649695, "learning_rate": 3.299453551912568e-05, "loss": 0.5019, "num_tokens": 625012495.0, "step": 4130 }, { "epoch": 1.220123930362939, "grad_norm": 0.25836206381609705, "learning_rate": 3.29672131147541e-05, "loss": 0.4794, "num_tokens": 625730025.0, "step": 4135 }, { "epoch": 1.2215992918264975, "grad_norm": 0.2715471466961357, "learning_rate": 3.2939890710382517e-05, "loss": 0.4862, "num_tokens": 626491475.0, "step": 4140 }, { "epoch": 1.223074653290056, "grad_norm": 0.2943076796683708, "learning_rate": 3.291256830601093e-05, "loss": 0.4901, "num_tokens": 627247840.0, "step": 4145 }, { "epoch": 1.2245500147536146, "grad_norm": 0.27890589400564203, "learning_rate": 3.288524590163934e-05, "loss": 0.488, "num_tokens": 628014963.0, "step": 4150 }, { "epoch": 1.2260253762171731, "grad_norm": 0.25170319861726326, "learning_rate": 3.285792349726776e-05, "loss": 0.49, "num_tokens": 628780154.0, "step": 4155 }, { "epoch": 1.2275007376807319, "grad_norm": 0.2603804238275981, "learning_rate": 3.283060109289618e-05, "loss": 0.499, "num_tokens": 629552894.0, "step": 4160 }, { "epoch": 1.2289760991442904, "grad_norm": 0.28435809609758417, "learning_rate": 3.280327868852459e-05, "loss": 0.5179, "num_tokens": 630348558.0, "step": 4165 }, { "epoch": 1.230451460607849, "grad_norm": 0.26300028406379244, "learning_rate": 3.277595628415301e-05, "loss": 0.487, "num_tokens": 631097609.0, "step": 4170 }, { "epoch": 1.2319268220714075, "grad_norm": 0.2659303889001788, "learning_rate": 3.274863387978142e-05, "loss": 0.4935, "num_tokens": 631873663.0, "step": 4175 }, { "epoch": 1.233402183534966, "grad_norm": 0.2585557204448752, "learning_rate": 3.272131147540984e-05, "loss": 0.4884, "num_tokens": 632625132.0, "step": 4180 }, { "epoch": 1.2348775449985245, "grad_norm": 0.2598684416299142, "learning_rate": 3.2693989071038253e-05, "loss": 0.5057, "num_tokens": 633385181.0, "step": 4185 }, { "epoch": 1.2363529064620833, "grad_norm": 0.29386552446871556, "learning_rate": 3.266666666666667e-05, "loss": 0.5115, "num_tokens": 634187077.0, "step": 4190 }, { "epoch": 1.2378282679256418, "grad_norm": 0.27265072501577015, "learning_rate": 3.263934426229509e-05, "loss": 0.4957, "num_tokens": 634921309.0, "step": 4195 }, { "epoch": 1.2393036293892004, "grad_norm": 0.24666714044701873, "learning_rate": 3.26120218579235e-05, "loss": 0.4872, "num_tokens": 635664033.0, "step": 4200 }, { "epoch": 1.240778990852759, "grad_norm": 0.2664012133069112, "learning_rate": 3.2584699453551913e-05, "loss": 0.4923, "num_tokens": 636447296.0, "step": 4205 }, { "epoch": 1.2422543523163174, "grad_norm": 0.24806277784732714, "learning_rate": 3.255737704918033e-05, "loss": 0.4858, "num_tokens": 637183088.0, "step": 4210 }, { "epoch": 1.2437297137798762, "grad_norm": 0.262949325551082, "learning_rate": 3.253005464480875e-05, "loss": 0.5095, "num_tokens": 637957936.0, "step": 4215 }, { "epoch": 1.2452050752434347, "grad_norm": 0.30605419122528094, "learning_rate": 3.2502732240437164e-05, "loss": 0.508, "num_tokens": 638769851.0, "step": 4220 }, { "epoch": 1.2466804367069932, "grad_norm": 0.2826577684984029, "learning_rate": 3.2475409836065574e-05, "loss": 0.4857, "num_tokens": 639456250.0, "step": 4225 }, { "epoch": 1.2481557981705518, "grad_norm": 0.2670105984603605, "learning_rate": 3.244808743169399e-05, "loss": 0.4988, "num_tokens": 640229963.0, "step": 4230 }, { "epoch": 1.2496311596341103, "grad_norm": 0.2616416141278644, "learning_rate": 3.242076502732241e-05, "loss": 0.4826, "num_tokens": 640950887.0, "step": 4235 }, { "epoch": 1.251106521097669, "grad_norm": 0.25492125303496377, "learning_rate": 3.2393442622950824e-05, "loss": 0.4626, "num_tokens": 641632302.0, "step": 4240 }, { "epoch": 1.2525818825612274, "grad_norm": 0.25849537278723544, "learning_rate": 3.236612021857924e-05, "loss": 0.4969, "num_tokens": 642428048.0, "step": 4245 }, { "epoch": 1.2540572440247861, "grad_norm": 0.26464828343151986, "learning_rate": 3.233879781420765e-05, "loss": 0.5042, "num_tokens": 643212268.0, "step": 4250 }, { "epoch": 1.2555326054883447, "grad_norm": 0.23530510299320867, "learning_rate": 3.231147540983607e-05, "loss": 0.4979, "num_tokens": 644013849.0, "step": 4255 }, { "epoch": 1.2570079669519032, "grad_norm": 0.24854696968458145, "learning_rate": 3.2284153005464484e-05, "loss": 0.4973, "num_tokens": 644774381.0, "step": 4260 }, { "epoch": 1.2584833284154617, "grad_norm": 0.2640585618307019, "learning_rate": 3.2256830601092894e-05, "loss": 0.4655, "num_tokens": 645506986.0, "step": 4265 }, { "epoch": 1.2599586898790203, "grad_norm": 0.3041934280445341, "learning_rate": 3.222950819672132e-05, "loss": 0.4882, "num_tokens": 646262153.0, "step": 4270 }, { "epoch": 1.261434051342579, "grad_norm": 0.2750932448575937, "learning_rate": 3.220218579234973e-05, "loss": 0.4951, "num_tokens": 647002305.0, "step": 4275 }, { "epoch": 1.2629094128061376, "grad_norm": 0.27457222918745383, "learning_rate": 3.2174863387978144e-05, "loss": 0.4797, "num_tokens": 647770335.0, "step": 4280 }, { "epoch": 1.264384774269696, "grad_norm": 0.2584405052421453, "learning_rate": 3.214754098360656e-05, "loss": 0.4952, "num_tokens": 648519477.0, "step": 4285 }, { "epoch": 1.2658601357332546, "grad_norm": 0.2571008929666446, "learning_rate": 3.212021857923497e-05, "loss": 0.5015, "num_tokens": 649252286.0, "step": 4290 }, { "epoch": 1.2673354971968132, "grad_norm": 0.2916351052040045, "learning_rate": 3.209289617486339e-05, "loss": 0.4857, "num_tokens": 649966064.0, "step": 4295 }, { "epoch": 1.268810858660372, "grad_norm": 0.26244703010547626, "learning_rate": 3.2065573770491804e-05, "loss": 0.4788, "num_tokens": 650704896.0, "step": 4300 }, { "epoch": 1.2702862201239304, "grad_norm": 0.2503217367782858, "learning_rate": 3.203825136612022e-05, "loss": 0.4917, "num_tokens": 651506777.0, "step": 4305 }, { "epoch": 1.271761581587489, "grad_norm": 0.24693859695583642, "learning_rate": 3.201092896174864e-05, "loss": 0.5063, "num_tokens": 652267719.0, "step": 4310 }, { "epoch": 1.2732369430510475, "grad_norm": 0.2531599428715662, "learning_rate": 3.198360655737705e-05, "loss": 0.4897, "num_tokens": 653008936.0, "step": 4315 }, { "epoch": 1.274712304514606, "grad_norm": 0.2541117908488436, "learning_rate": 3.1956284153005464e-05, "loss": 0.4917, "num_tokens": 653750616.0, "step": 4320 }, { "epoch": 1.2761876659781646, "grad_norm": 0.2798346906500383, "learning_rate": 3.192896174863388e-05, "loss": 0.4897, "num_tokens": 654477689.0, "step": 4325 }, { "epoch": 1.277663027441723, "grad_norm": 0.28058103205630597, "learning_rate": 3.19016393442623e-05, "loss": 0.485, "num_tokens": 655247723.0, "step": 4330 }, { "epoch": 1.2791383889052819, "grad_norm": 0.2638725294714555, "learning_rate": 3.1874316939890714e-05, "loss": 0.495, "num_tokens": 656017546.0, "step": 4335 }, { "epoch": 1.2806137503688404, "grad_norm": 0.31293512062906825, "learning_rate": 3.1846994535519124e-05, "loss": 0.4886, "num_tokens": 656723421.0, "step": 4340 }, { "epoch": 1.282089111832399, "grad_norm": 0.26727575907493895, "learning_rate": 3.181967213114754e-05, "loss": 0.4886, "num_tokens": 657473219.0, "step": 4345 }, { "epoch": 1.2835644732959575, "grad_norm": 0.2557541335332184, "learning_rate": 3.179234972677596e-05, "loss": 0.5006, "num_tokens": 658241548.0, "step": 4350 }, { "epoch": 1.285039834759516, "grad_norm": 0.26628831270415343, "learning_rate": 3.1765027322404374e-05, "loss": 0.481, "num_tokens": 658993798.0, "step": 4355 }, { "epoch": 1.2865151962230748, "grad_norm": 0.27189577147453486, "learning_rate": 3.173770491803279e-05, "loss": 0.4953, "num_tokens": 659787800.0, "step": 4360 }, { "epoch": 1.2879905576866333, "grad_norm": 0.26778505087099946, "learning_rate": 3.17103825136612e-05, "loss": 0.4808, "num_tokens": 660546018.0, "step": 4365 }, { "epoch": 1.2894659191501918, "grad_norm": 0.3358898863923751, "learning_rate": 3.168306010928962e-05, "loss": 0.498, "num_tokens": 661326354.0, "step": 4370 }, { "epoch": 1.2909412806137504, "grad_norm": 0.26700880864737203, "learning_rate": 3.1655737704918034e-05, "loss": 0.4756, "num_tokens": 662071570.0, "step": 4375 }, { "epoch": 1.2924166420773089, "grad_norm": 0.25887765751141256, "learning_rate": 3.162841530054645e-05, "loss": 0.5011, "num_tokens": 662811637.0, "step": 4380 }, { "epoch": 1.2938920035408676, "grad_norm": 0.24704215981941166, "learning_rate": 3.160109289617487e-05, "loss": 0.4916, "num_tokens": 663581810.0, "step": 4385 }, { "epoch": 1.295367365004426, "grad_norm": 0.28623662525705695, "learning_rate": 3.157377049180328e-05, "loss": 0.4869, "num_tokens": 664321806.0, "step": 4390 }, { "epoch": 1.2968427264679847, "grad_norm": 0.26889335109054247, "learning_rate": 3.1546448087431694e-05, "loss": 0.4955, "num_tokens": 665027447.0, "step": 4395 }, { "epoch": 1.2983180879315432, "grad_norm": 0.2721839621653255, "learning_rate": 3.151912568306011e-05, "loss": 0.4897, "num_tokens": 665752631.0, "step": 4400 }, { "epoch": 1.2997934493951018, "grad_norm": 0.2963124115583121, "learning_rate": 3.149180327868853e-05, "loss": 0.4858, "num_tokens": 666500788.0, "step": 4405 }, { "epoch": 1.3012688108586603, "grad_norm": 0.24341839868846, "learning_rate": 3.1464480874316944e-05, "loss": 0.5049, "num_tokens": 667285221.0, "step": 4410 }, { "epoch": 1.3027441723222188, "grad_norm": 0.2563097510817125, "learning_rate": 3.1437158469945354e-05, "loss": 0.4754, "num_tokens": 668003973.0, "step": 4415 }, { "epoch": 1.3042195337857776, "grad_norm": 0.25065745490188385, "learning_rate": 3.140983606557377e-05, "loss": 0.4938, "num_tokens": 668740097.0, "step": 4420 }, { "epoch": 1.3056948952493361, "grad_norm": 0.2569580465333069, "learning_rate": 3.138251366120219e-05, "loss": 0.5033, "num_tokens": 669432619.0, "step": 4425 }, { "epoch": 1.3071702567128947, "grad_norm": 0.24892111674685685, "learning_rate": 3.1355191256830604e-05, "loss": 0.4933, "num_tokens": 670132771.0, "step": 4430 }, { "epoch": 1.3086456181764532, "grad_norm": 0.2573035269001245, "learning_rate": 3.132786885245902e-05, "loss": 0.485, "num_tokens": 670846198.0, "step": 4435 }, { "epoch": 1.3101209796400117, "grad_norm": 0.25840016923969233, "learning_rate": 3.130054644808743e-05, "loss": 0.4856, "num_tokens": 671586462.0, "step": 4440 }, { "epoch": 1.3115963411035705, "grad_norm": 0.2407352311544983, "learning_rate": 3.127322404371585e-05, "loss": 0.4973, "num_tokens": 672332993.0, "step": 4445 }, { "epoch": 1.313071702567129, "grad_norm": 0.2552124153493733, "learning_rate": 3.1245901639344265e-05, "loss": 0.4817, "num_tokens": 673056211.0, "step": 4450 }, { "epoch": 1.3145470640306876, "grad_norm": 0.2608634490264722, "learning_rate": 3.121857923497268e-05, "loss": 0.4859, "num_tokens": 673813684.0, "step": 4455 }, { "epoch": 1.316022425494246, "grad_norm": 0.24342856115964442, "learning_rate": 3.11912568306011e-05, "loss": 0.494, "num_tokens": 674599039.0, "step": 4460 }, { "epoch": 1.3174977869578046, "grad_norm": 0.2615707361542615, "learning_rate": 3.116393442622951e-05, "loss": 0.5116, "num_tokens": 675359751.0, "step": 4465 }, { "epoch": 1.3189731484213634, "grad_norm": 0.26261009930671425, "learning_rate": 3.1136612021857925e-05, "loss": 0.4848, "num_tokens": 676089656.0, "step": 4470 }, { "epoch": 1.3204485098849217, "grad_norm": 0.2444729690841613, "learning_rate": 3.110928961748634e-05, "loss": 0.4955, "num_tokens": 676866941.0, "step": 4475 }, { "epoch": 1.3219238713484804, "grad_norm": 0.2651945808429223, "learning_rate": 3.108196721311475e-05, "loss": 0.5001, "num_tokens": 677637104.0, "step": 4480 }, { "epoch": 1.323399232812039, "grad_norm": 0.260598590534948, "learning_rate": 3.1054644808743175e-05, "loss": 0.4853, "num_tokens": 678388861.0, "step": 4485 }, { "epoch": 1.3248745942755975, "grad_norm": 0.2492310081346869, "learning_rate": 3.1027322404371585e-05, "loss": 0.5041, "num_tokens": 679190466.0, "step": 4490 }, { "epoch": 1.326349955739156, "grad_norm": 0.2833950709172876, "learning_rate": 3.1e-05, "loss": 0.4799, "num_tokens": 679885195.0, "step": 4495 }, { "epoch": 1.3278253172027146, "grad_norm": 0.27612399669001825, "learning_rate": 3.097267759562842e-05, "loss": 0.4936, "num_tokens": 680598449.0, "step": 4500 }, { "epoch": 1.3293006786662733, "grad_norm": 0.27237218416529224, "learning_rate": 3.094535519125683e-05, "loss": 0.5059, "num_tokens": 681447836.0, "step": 4505 }, { "epoch": 1.3307760401298319, "grad_norm": 0.24251550825303356, "learning_rate": 3.091803278688525e-05, "loss": 0.4897, "num_tokens": 682209182.0, "step": 4510 }, { "epoch": 1.3322514015933904, "grad_norm": 0.2773776408495967, "learning_rate": 3.089071038251366e-05, "loss": 0.4923, "num_tokens": 682979457.0, "step": 4515 }, { "epoch": 1.333726763056949, "grad_norm": 0.2544767851294298, "learning_rate": 3.086338797814208e-05, "loss": 0.5019, "num_tokens": 683784218.0, "step": 4520 }, { "epoch": 1.3352021245205075, "grad_norm": 0.27869616733337715, "learning_rate": 3.0836065573770495e-05, "loss": 0.4874, "num_tokens": 684537347.0, "step": 4525 }, { "epoch": 1.3366774859840662, "grad_norm": 0.2636207711383969, "learning_rate": 3.0808743169398905e-05, "loss": 0.4942, "num_tokens": 685299483.0, "step": 4530 }, { "epoch": 1.3381528474476247, "grad_norm": 0.26622702510900204, "learning_rate": 3.078142076502732e-05, "loss": 0.4951, "num_tokens": 686065423.0, "step": 4535 }, { "epoch": 1.3396282089111833, "grad_norm": 0.2641060913719243, "learning_rate": 3.075409836065574e-05, "loss": 0.4898, "num_tokens": 686827059.0, "step": 4540 }, { "epoch": 1.3411035703747418, "grad_norm": 0.26374375799820143, "learning_rate": 3.0726775956284155e-05, "loss": 0.4818, "num_tokens": 687560266.0, "step": 4545 }, { "epoch": 1.3425789318383003, "grad_norm": 0.2618666153944001, "learning_rate": 3.069945355191257e-05, "loss": 0.4861, "num_tokens": 688296990.0, "step": 4550 }, { "epoch": 1.3440542933018589, "grad_norm": 0.2494978606380361, "learning_rate": 3.067213114754098e-05, "loss": 0.4833, "num_tokens": 689024745.0, "step": 4555 }, { "epoch": 1.3455296547654174, "grad_norm": 0.26693884364665293, "learning_rate": 3.06448087431694e-05, "loss": 0.5045, "num_tokens": 689829005.0, "step": 4560 }, { "epoch": 1.3470050162289762, "grad_norm": 0.2969623998772953, "learning_rate": 3.061748633879782e-05, "loss": 0.4764, "num_tokens": 690496051.0, "step": 4565 }, { "epoch": 1.3484803776925347, "grad_norm": 0.2506793270886518, "learning_rate": 3.059016393442623e-05, "loss": 0.4971, "num_tokens": 691280299.0, "step": 4570 }, { "epoch": 1.3499557391560932, "grad_norm": 0.2595891210519697, "learning_rate": 3.056284153005465e-05, "loss": 0.4854, "num_tokens": 692069879.0, "step": 4575 }, { "epoch": 1.3514311006196518, "grad_norm": 0.2714993056861188, "learning_rate": 3.053551912568306e-05, "loss": 0.4932, "num_tokens": 692801990.0, "step": 4580 }, { "epoch": 1.3529064620832103, "grad_norm": 0.24951847275711506, "learning_rate": 3.050819672131148e-05, "loss": 0.4807, "num_tokens": 693574596.0, "step": 4585 }, { "epoch": 1.354381823546769, "grad_norm": 0.25371539576258656, "learning_rate": 3.0480874316939895e-05, "loss": 0.4995, "num_tokens": 694353595.0, "step": 4590 }, { "epoch": 1.3558571850103276, "grad_norm": 0.2489207195028011, "learning_rate": 3.0453551912568305e-05, "loss": 0.482, "num_tokens": 695085256.0, "step": 4595 }, { "epoch": 1.3573325464738861, "grad_norm": 0.2518859363441992, "learning_rate": 3.0426229508196725e-05, "loss": 0.4762, "num_tokens": 695827510.0, "step": 4600 }, { "epoch": 1.3588079079374447, "grad_norm": 0.27530094318052356, "learning_rate": 3.0398907103825135e-05, "loss": 0.4731, "num_tokens": 696524987.0, "step": 4605 }, { "epoch": 1.3602832694010032, "grad_norm": 0.27884519482094844, "learning_rate": 3.0371584699453555e-05, "loss": 0.4775, "num_tokens": 697237822.0, "step": 4610 }, { "epoch": 1.361758630864562, "grad_norm": 0.2635071269320732, "learning_rate": 3.0344262295081972e-05, "loss": 0.4793, "num_tokens": 697968541.0, "step": 4615 }, { "epoch": 1.3632339923281203, "grad_norm": 0.2521305197966613, "learning_rate": 3.0316939890710382e-05, "loss": 0.4961, "num_tokens": 698754589.0, "step": 4620 }, { "epoch": 1.364709353791679, "grad_norm": 0.2745915326604561, "learning_rate": 3.0289617486338802e-05, "loss": 0.5035, "num_tokens": 699507647.0, "step": 4625 }, { "epoch": 1.3661847152552375, "grad_norm": 0.26838760149404717, "learning_rate": 3.0262295081967212e-05, "loss": 0.4948, "num_tokens": 700293751.0, "step": 4630 }, { "epoch": 1.367660076718796, "grad_norm": 0.27062173378465304, "learning_rate": 3.023497267759563e-05, "loss": 0.4929, "num_tokens": 701021503.0, "step": 4635 }, { "epoch": 1.3691354381823546, "grad_norm": 0.25419345993602316, "learning_rate": 3.020765027322405e-05, "loss": 0.503, "num_tokens": 701766660.0, "step": 4640 }, { "epoch": 1.3706107996459131, "grad_norm": 0.2568262894702245, "learning_rate": 3.018032786885246e-05, "loss": 0.4834, "num_tokens": 702543327.0, "step": 4645 }, { "epoch": 1.372086161109472, "grad_norm": 0.24239831595004305, "learning_rate": 3.0153005464480875e-05, "loss": 0.4802, "num_tokens": 703298097.0, "step": 4650 }, { "epoch": 1.3735615225730304, "grad_norm": 0.2721658341434883, "learning_rate": 3.012568306010929e-05, "loss": 0.5032, "num_tokens": 704084727.0, "step": 4655 }, { "epoch": 1.375036884036589, "grad_norm": 0.2552680599604381, "learning_rate": 3.0098360655737705e-05, "loss": 0.4872, "num_tokens": 704839995.0, "step": 4660 }, { "epoch": 1.3765122455001475, "grad_norm": 0.2511659161973677, "learning_rate": 3.0071038251366122e-05, "loss": 0.488, "num_tokens": 705573530.0, "step": 4665 }, { "epoch": 1.377987606963706, "grad_norm": 0.2677676292559146, "learning_rate": 3.0043715846994535e-05, "loss": 0.4847, "num_tokens": 706312336.0, "step": 4670 }, { "epoch": 1.3794629684272648, "grad_norm": 0.2723660851229304, "learning_rate": 3.0016393442622952e-05, "loss": 0.4816, "num_tokens": 707013943.0, "step": 4675 }, { "epoch": 1.3809383298908233, "grad_norm": 0.2352258484315079, "learning_rate": 2.9989071038251365e-05, "loss": 0.4885, "num_tokens": 707792937.0, "step": 4680 }, { "epoch": 1.3824136913543819, "grad_norm": 0.24826537087710776, "learning_rate": 2.9961748633879782e-05, "loss": 0.482, "num_tokens": 708562576.0, "step": 4685 }, { "epoch": 1.3838890528179404, "grad_norm": 0.2601005060971761, "learning_rate": 2.99344262295082e-05, "loss": 0.4936, "num_tokens": 709310727.0, "step": 4690 }, { "epoch": 1.385364414281499, "grad_norm": 0.2495928683442421, "learning_rate": 2.9907103825136612e-05, "loss": 0.4951, "num_tokens": 710093149.0, "step": 4695 }, { "epoch": 1.3868397757450575, "grad_norm": 0.2601992287606837, "learning_rate": 2.987978142076503e-05, "loss": 0.5064, "num_tokens": 710887619.0, "step": 4700 }, { "epoch": 1.388315137208616, "grad_norm": 0.25895314556985866, "learning_rate": 2.9852459016393442e-05, "loss": 0.4899, "num_tokens": 711644305.0, "step": 4705 }, { "epoch": 1.3897904986721747, "grad_norm": 0.2507169934349053, "learning_rate": 2.982513661202186e-05, "loss": 0.4863, "num_tokens": 712410538.0, "step": 4710 }, { "epoch": 1.3912658601357333, "grad_norm": 0.28566372256595934, "learning_rate": 2.9797814207650276e-05, "loss": 0.4805, "num_tokens": 713141829.0, "step": 4715 }, { "epoch": 1.3927412215992918, "grad_norm": 0.2742846469913542, "learning_rate": 2.977049180327869e-05, "loss": 0.4874, "num_tokens": 713945048.0, "step": 4720 }, { "epoch": 1.3942165830628503, "grad_norm": 0.25615541786152324, "learning_rate": 2.9743169398907106e-05, "loss": 0.4871, "num_tokens": 714641564.0, "step": 4725 }, { "epoch": 1.3956919445264089, "grad_norm": 0.25889966187032937, "learning_rate": 2.971584699453552e-05, "loss": 0.5004, "num_tokens": 715460102.0, "step": 4730 }, { "epoch": 1.3971673059899676, "grad_norm": 0.24453014657695182, "learning_rate": 2.9688524590163936e-05, "loss": 0.4945, "num_tokens": 716210325.0, "step": 4735 }, { "epoch": 1.3986426674535262, "grad_norm": 0.28169954935620584, "learning_rate": 2.9661202185792352e-05, "loss": 0.4794, "num_tokens": 716953698.0, "step": 4740 }, { "epoch": 1.4001180289170847, "grad_norm": 0.27463454340401794, "learning_rate": 2.9633879781420766e-05, "loss": 0.4849, "num_tokens": 717706681.0, "step": 4745 }, { "epoch": 1.4015933903806432, "grad_norm": 0.2619159177109216, "learning_rate": 2.9606557377049182e-05, "loss": 0.5086, "num_tokens": 718530442.0, "step": 4750 }, { "epoch": 1.4030687518442018, "grad_norm": 0.28462077002185043, "learning_rate": 2.95792349726776e-05, "loss": 0.4969, "num_tokens": 719299261.0, "step": 4755 }, { "epoch": 1.4045441133077605, "grad_norm": 0.24863771905121346, "learning_rate": 2.9551912568306012e-05, "loss": 0.484, "num_tokens": 720031707.0, "step": 4760 }, { "epoch": 1.406019474771319, "grad_norm": 0.2523593846189948, "learning_rate": 2.952459016393443e-05, "loss": 0.4925, "num_tokens": 720821169.0, "step": 4765 }, { "epoch": 1.4074948362348776, "grad_norm": 0.24566019277011833, "learning_rate": 2.9497267759562842e-05, "loss": 0.4993, "num_tokens": 721570885.0, "step": 4770 }, { "epoch": 1.4089701976984361, "grad_norm": 0.25197934019753, "learning_rate": 2.946994535519126e-05, "loss": 0.5057, "num_tokens": 722339898.0, "step": 4775 }, { "epoch": 1.4104455591619947, "grad_norm": 0.24735918758934955, "learning_rate": 2.9442622950819676e-05, "loss": 0.5073, "num_tokens": 723186699.0, "step": 4780 }, { "epoch": 1.4119209206255532, "grad_norm": 0.25390850161541284, "learning_rate": 2.941530054644809e-05, "loss": 0.4917, "num_tokens": 723967109.0, "step": 4785 }, { "epoch": 1.4133962820891117, "grad_norm": 0.3596723102406904, "learning_rate": 2.9387978142076506e-05, "loss": 0.4598, "num_tokens": 724626400.0, "step": 4790 }, { "epoch": 1.4148716435526705, "grad_norm": 0.25423990011811226, "learning_rate": 2.9360655737704916e-05, "loss": 0.4896, "num_tokens": 725342742.0, "step": 4795 }, { "epoch": 1.416347005016229, "grad_norm": 0.27419964118065215, "learning_rate": 2.9333333333333336e-05, "loss": 0.4773, "num_tokens": 726072339.0, "step": 4800 }, { "epoch": 1.4178223664797875, "grad_norm": 0.24896110700386695, "learning_rate": 2.9306010928961753e-05, "loss": 0.5007, "num_tokens": 726847174.0, "step": 4805 }, { "epoch": 1.419297727943346, "grad_norm": 0.26431908270467147, "learning_rate": 2.9278688524590166e-05, "loss": 0.4845, "num_tokens": 727597107.0, "step": 4810 }, { "epoch": 1.4207730894069046, "grad_norm": 0.23200723102232376, "learning_rate": 2.9251366120218583e-05, "loss": 0.5062, "num_tokens": 728414631.0, "step": 4815 }, { "epoch": 1.4222484508704634, "grad_norm": 0.2593619834020515, "learning_rate": 2.9224043715846993e-05, "loss": 0.5016, "num_tokens": 729210849.0, "step": 4820 }, { "epoch": 1.423723812334022, "grad_norm": 0.25826118554694494, "learning_rate": 2.9196721311475413e-05, "loss": 0.4847, "num_tokens": 729948701.0, "step": 4825 }, { "epoch": 1.4251991737975804, "grad_norm": 0.2770089070814888, "learning_rate": 2.916939890710383e-05, "loss": 0.4937, "num_tokens": 730719651.0, "step": 4830 }, { "epoch": 1.426674535261139, "grad_norm": 0.26300970547552743, "learning_rate": 2.914207650273224e-05, "loss": 0.4935, "num_tokens": 731493674.0, "step": 4835 }, { "epoch": 1.4281498967246975, "grad_norm": 0.26055869220139727, "learning_rate": 2.911475409836066e-05, "loss": 0.5011, "num_tokens": 732230314.0, "step": 4840 }, { "epoch": 1.4296252581882563, "grad_norm": 0.25285529121567957, "learning_rate": 2.908743169398907e-05, "loss": 0.5073, "num_tokens": 733071218.0, "step": 4845 }, { "epoch": 1.4311006196518146, "grad_norm": 0.26249340772262225, "learning_rate": 2.9060109289617486e-05, "loss": 0.4741, "num_tokens": 733790036.0, "step": 4850 }, { "epoch": 1.4325759811153733, "grad_norm": 0.26640734767314683, "learning_rate": 2.9032786885245906e-05, "loss": 0.499, "num_tokens": 734525412.0, "step": 4855 }, { "epoch": 1.4340513425789319, "grad_norm": 0.2865891475764457, "learning_rate": 2.9005464480874316e-05, "loss": 0.4942, "num_tokens": 735289438.0, "step": 4860 }, { "epoch": 1.4355267040424904, "grad_norm": 0.2941176970999025, "learning_rate": 2.8978142076502733e-05, "loss": 0.4924, "num_tokens": 736014996.0, "step": 4865 }, { "epoch": 1.437002065506049, "grad_norm": 0.2676697020262545, "learning_rate": 2.8950819672131146e-05, "loss": 0.493, "num_tokens": 736760790.0, "step": 4870 }, { "epoch": 1.4384774269696075, "grad_norm": 0.2957182414703914, "learning_rate": 2.8923497267759563e-05, "loss": 0.4869, "num_tokens": 737513735.0, "step": 4875 }, { "epoch": 1.4399527884331662, "grad_norm": 0.2421186286876067, "learning_rate": 2.8896174863387983e-05, "loss": 0.4819, "num_tokens": 738290669.0, "step": 4880 }, { "epoch": 1.4414281498967247, "grad_norm": 0.24953056028563217, "learning_rate": 2.8868852459016393e-05, "loss": 0.4794, "num_tokens": 739031199.0, "step": 4885 }, { "epoch": 1.4429035113602833, "grad_norm": 0.24058553365952356, "learning_rate": 2.884153005464481e-05, "loss": 0.4811, "num_tokens": 739714298.0, "step": 4890 }, { "epoch": 1.4443788728238418, "grad_norm": 0.25709858482529324, "learning_rate": 2.8814207650273223e-05, "loss": 0.4845, "num_tokens": 740469792.0, "step": 4895 }, { "epoch": 1.4458542342874003, "grad_norm": 0.2616168653789251, "learning_rate": 2.878688524590164e-05, "loss": 0.4947, "num_tokens": 741183909.0, "step": 4900 }, { "epoch": 1.447329595750959, "grad_norm": 0.26336766890349567, "learning_rate": 2.8759562841530056e-05, "loss": 0.4786, "num_tokens": 741943472.0, "step": 4905 }, { "epoch": 1.4488049572145176, "grad_norm": 0.2530319858069693, "learning_rate": 2.873224043715847e-05, "loss": 0.467, "num_tokens": 742633112.0, "step": 4910 }, { "epoch": 1.4502803186780762, "grad_norm": 0.24962999825038756, "learning_rate": 2.8704918032786886e-05, "loss": 0.4951, "num_tokens": 743486925.0, "step": 4915 }, { "epoch": 1.4517556801416347, "grad_norm": 0.2545820541920913, "learning_rate": 2.8677595628415303e-05, "loss": 0.497, "num_tokens": 744304803.0, "step": 4920 }, { "epoch": 1.4532310416051932, "grad_norm": 0.24543405084419587, "learning_rate": 2.8650273224043716e-05, "loss": 0.4792, "num_tokens": 745091288.0, "step": 4925 }, { "epoch": 1.4547064030687518, "grad_norm": 0.23107444974442679, "learning_rate": 2.8622950819672133e-05, "loss": 0.4893, "num_tokens": 745870236.0, "step": 4930 }, { "epoch": 1.4561817645323103, "grad_norm": 0.24422896600095237, "learning_rate": 2.8595628415300546e-05, "loss": 0.4797, "num_tokens": 746605506.0, "step": 4935 }, { "epoch": 1.457657125995869, "grad_norm": 0.2444356198686661, "learning_rate": 2.8568306010928963e-05, "loss": 0.5099, "num_tokens": 747425258.0, "step": 4940 }, { "epoch": 1.4591324874594276, "grad_norm": 0.2692745020853461, "learning_rate": 2.854098360655738e-05, "loss": 0.4757, "num_tokens": 748199561.0, "step": 4945 }, { "epoch": 1.4606078489229861, "grad_norm": 0.24870793022599877, "learning_rate": 2.8513661202185793e-05, "loss": 0.4816, "num_tokens": 748973760.0, "step": 4950 }, { "epoch": 1.4620832103865447, "grad_norm": 0.25084497121329236, "learning_rate": 2.848633879781421e-05, "loss": 0.4639, "num_tokens": 749702356.0, "step": 4955 }, { "epoch": 1.4635585718501032, "grad_norm": 0.26751320499838616, "learning_rate": 2.8459016393442623e-05, "loss": 0.5007, "num_tokens": 750499654.0, "step": 4960 }, { "epoch": 1.465033933313662, "grad_norm": 0.26341227098620124, "learning_rate": 2.843169398907104e-05, "loss": 0.5022, "num_tokens": 751293716.0, "step": 4965 }, { "epoch": 1.4665092947772205, "grad_norm": 0.25072048270294134, "learning_rate": 2.8404371584699457e-05, "loss": 0.4977, "num_tokens": 752086285.0, "step": 4970 }, { "epoch": 1.467984656240779, "grad_norm": 0.23453191073165408, "learning_rate": 2.837704918032787e-05, "loss": 0.4938, "num_tokens": 752878582.0, "step": 4975 }, { "epoch": 1.4694600177043375, "grad_norm": 0.24669281250977074, "learning_rate": 2.8349726775956287e-05, "loss": 0.4905, "num_tokens": 753616646.0, "step": 4980 }, { "epoch": 1.470935379167896, "grad_norm": 0.260093583619849, "learning_rate": 2.83224043715847e-05, "loss": 0.4835, "num_tokens": 754398994.0, "step": 4985 }, { "epoch": 1.4724107406314548, "grad_norm": 0.24106711291178798, "learning_rate": 2.8295081967213117e-05, "loss": 0.478, "num_tokens": 755190241.0, "step": 4990 }, { "epoch": 1.4738861020950131, "grad_norm": 0.26303186782282056, "learning_rate": 2.8267759562841533e-05, "loss": 0.4906, "num_tokens": 755950839.0, "step": 4995 }, { "epoch": 1.475361463558572, "grad_norm": 0.2474942876624566, "learning_rate": 2.8240437158469947e-05, "loss": 0.4724, "num_tokens": 756706913.0, "step": 5000 }, { "epoch": 1.4768368250221304, "grad_norm": 0.24126392121673762, "learning_rate": 2.8213114754098363e-05, "loss": 0.4917, "num_tokens": 757522921.0, "step": 5005 }, { "epoch": 1.478312186485689, "grad_norm": 0.2425591002802642, "learning_rate": 2.8185792349726773e-05, "loss": 0.4926, "num_tokens": 758285197.0, "step": 5010 }, { "epoch": 1.4797875479492475, "grad_norm": 0.26449422125171596, "learning_rate": 2.8158469945355194e-05, "loss": 0.4578, "num_tokens": 758944264.0, "step": 5015 }, { "epoch": 1.481262909412806, "grad_norm": 0.24911889462991896, "learning_rate": 2.813114754098361e-05, "loss": 0.4921, "num_tokens": 759727561.0, "step": 5020 }, { "epoch": 1.4827382708763648, "grad_norm": 0.24843871670862844, "learning_rate": 2.8103825136612024e-05, "loss": 0.4779, "num_tokens": 760492496.0, "step": 5025 }, { "epoch": 1.4842136323399233, "grad_norm": 0.24896041588134746, "learning_rate": 2.807650273224044e-05, "loss": 0.4833, "num_tokens": 761242374.0, "step": 5030 }, { "epoch": 1.4856889938034818, "grad_norm": 0.26343286768075014, "learning_rate": 2.804918032786885e-05, "loss": 0.4934, "num_tokens": 762078540.0, "step": 5035 }, { "epoch": 1.4871643552670404, "grad_norm": 0.2356062725843235, "learning_rate": 2.802185792349727e-05, "loss": 0.4688, "num_tokens": 762823744.0, "step": 5040 }, { "epoch": 1.488639716730599, "grad_norm": 0.2604603390649696, "learning_rate": 2.7994535519125687e-05, "loss": 0.4746, "num_tokens": 763570059.0, "step": 5045 }, { "epoch": 1.4901150781941577, "grad_norm": 0.24856139031470872, "learning_rate": 2.7967213114754097e-05, "loss": 0.4785, "num_tokens": 764299967.0, "step": 5050 }, { "epoch": 1.4915904396577162, "grad_norm": 0.2938609708669103, "learning_rate": 2.7939890710382517e-05, "loss": 0.4752, "num_tokens": 764994810.0, "step": 5055 }, { "epoch": 1.4930658011212747, "grad_norm": 0.2594952155998531, "learning_rate": 2.7912568306010927e-05, "loss": 0.5028, "num_tokens": 765736623.0, "step": 5060 }, { "epoch": 1.4945411625848333, "grad_norm": 0.2667748952374824, "learning_rate": 2.7885245901639344e-05, "loss": 0.4891, "num_tokens": 766484815.0, "step": 5065 }, { "epoch": 1.4960165240483918, "grad_norm": 0.2577749871968737, "learning_rate": 2.7857923497267764e-05, "loss": 0.5008, "num_tokens": 767210826.0, "step": 5070 }, { "epoch": 1.4974918855119506, "grad_norm": 0.23733676118407954, "learning_rate": 2.7830601092896174e-05, "loss": 0.4899, "num_tokens": 768006585.0, "step": 5075 }, { "epoch": 1.4989672469755089, "grad_norm": 0.23072784315034348, "learning_rate": 2.7803278688524594e-05, "loss": 0.4862, "num_tokens": 768793074.0, "step": 5080 }, { "epoch": 1.5004426084390676, "grad_norm": 0.2559647505818178, "learning_rate": 2.7775956284153004e-05, "loss": 0.4914, "num_tokens": 769551729.0, "step": 5085 }, { "epoch": 1.5019179699026262, "grad_norm": 0.2659627583108194, "learning_rate": 2.774863387978142e-05, "loss": 0.4919, "num_tokens": 770251709.0, "step": 5090 }, { "epoch": 1.5033933313661847, "grad_norm": 0.2505154684913873, "learning_rate": 2.772131147540984e-05, "loss": 0.4923, "num_tokens": 771003141.0, "step": 5095 }, { "epoch": 1.5048686928297434, "grad_norm": 0.253859665295668, "learning_rate": 2.769398907103825e-05, "loss": 0.4993, "num_tokens": 771817130.0, "step": 5100 }, { "epoch": 1.5063440542933018, "grad_norm": 0.25072523143605946, "learning_rate": 2.7666666666666667e-05, "loss": 0.4784, "num_tokens": 772551653.0, "step": 5105 }, { "epoch": 1.5078194157568605, "grad_norm": 0.24098007725828996, "learning_rate": 2.7639344262295087e-05, "loss": 0.4746, "num_tokens": 773314634.0, "step": 5110 }, { "epoch": 1.509294777220419, "grad_norm": 0.2726259986562286, "learning_rate": 2.7612021857923497e-05, "loss": 0.4856, "num_tokens": 774111154.0, "step": 5115 }, { "epoch": 1.5107701386839776, "grad_norm": 0.2523232297427533, "learning_rate": 2.7584699453551914e-05, "loss": 0.4769, "num_tokens": 774835724.0, "step": 5120 }, { "epoch": 1.5122455001475361, "grad_norm": 0.2784673879173644, "learning_rate": 2.7557377049180327e-05, "loss": 0.4888, "num_tokens": 775621302.0, "step": 5125 }, { "epoch": 1.5137208616110946, "grad_norm": 0.2874189010800504, "learning_rate": 2.7530054644808744e-05, "loss": 0.4831, "num_tokens": 776387825.0, "step": 5130 }, { "epoch": 1.5151962230746534, "grad_norm": 0.25738016878766445, "learning_rate": 2.750273224043716e-05, "loss": 0.493, "num_tokens": 777180183.0, "step": 5135 }, { "epoch": 1.5166715845382117, "grad_norm": 0.2446474082871632, "learning_rate": 2.7475409836065574e-05, "loss": 0.4833, "num_tokens": 778009423.0, "step": 5140 }, { "epoch": 1.5181469460017705, "grad_norm": 0.26684473894804206, "learning_rate": 2.744808743169399e-05, "loss": 0.4856, "num_tokens": 778821872.0, "step": 5145 }, { "epoch": 1.519622307465329, "grad_norm": 0.23853462803583303, "learning_rate": 2.7420765027322404e-05, "loss": 0.4636, "num_tokens": 779531792.0, "step": 5150 }, { "epoch": 1.5210976689288875, "grad_norm": 0.2338222983712589, "learning_rate": 2.739344262295082e-05, "loss": 0.4714, "num_tokens": 780280844.0, "step": 5155 }, { "epoch": 1.5225730303924463, "grad_norm": 0.2443032692873293, "learning_rate": 2.7366120218579237e-05, "loss": 0.4947, "num_tokens": 781063403.0, "step": 5160 }, { "epoch": 1.5240483918560046, "grad_norm": 0.26828740378904786, "learning_rate": 2.733879781420765e-05, "loss": 0.481, "num_tokens": 781764861.0, "step": 5165 }, { "epoch": 1.5255237533195634, "grad_norm": 0.25242073586155883, "learning_rate": 2.7311475409836067e-05, "loss": 0.4764, "num_tokens": 782540512.0, "step": 5170 }, { "epoch": 1.526999114783122, "grad_norm": 0.26673837976780296, "learning_rate": 2.728415300546448e-05, "loss": 0.4936, "num_tokens": 783328923.0, "step": 5175 }, { "epoch": 1.5284744762466804, "grad_norm": 0.2716642940023848, "learning_rate": 2.7256830601092898e-05, "loss": 0.485, "num_tokens": 784124851.0, "step": 5180 }, { "epoch": 1.529949837710239, "grad_norm": 0.2664180112333674, "learning_rate": 2.7229508196721314e-05, "loss": 0.4943, "num_tokens": 784881016.0, "step": 5185 }, { "epoch": 1.5314251991737975, "grad_norm": 0.25416784546450405, "learning_rate": 2.7202185792349728e-05, "loss": 0.4993, "num_tokens": 785702513.0, "step": 5190 }, { "epoch": 1.5329005606373562, "grad_norm": 0.2601679554713974, "learning_rate": 2.7174863387978144e-05, "loss": 0.5038, "num_tokens": 786497886.0, "step": 5195 }, { "epoch": 1.5343759221009146, "grad_norm": 0.2578604775956736, "learning_rate": 2.7147540983606558e-05, "loss": 0.4808, "num_tokens": 787248004.0, "step": 5200 }, { "epoch": 1.5358512835644733, "grad_norm": 0.26711044182740706, "learning_rate": 2.7120218579234974e-05, "loss": 0.4641, "num_tokens": 787926236.0, "step": 5205 }, { "epoch": 1.5373266450280318, "grad_norm": 0.24950705323803138, "learning_rate": 2.709289617486339e-05, "loss": 0.475, "num_tokens": 788707802.0, "step": 5210 }, { "epoch": 1.5388020064915904, "grad_norm": 0.2692883893487866, "learning_rate": 2.7065573770491804e-05, "loss": 0.4831, "num_tokens": 789462400.0, "step": 5215 }, { "epoch": 1.5402773679551491, "grad_norm": 0.25743647973075895, "learning_rate": 2.703825136612022e-05, "loss": 0.504, "num_tokens": 790229488.0, "step": 5220 }, { "epoch": 1.5417527294187074, "grad_norm": 0.2575305429712383, "learning_rate": 2.7010928961748634e-05, "loss": 0.4992, "num_tokens": 790967566.0, "step": 5225 }, { "epoch": 1.5432280908822662, "grad_norm": 0.24473728272634954, "learning_rate": 2.698360655737705e-05, "loss": 0.4828, "num_tokens": 791691722.0, "step": 5230 }, { "epoch": 1.5447034523458247, "grad_norm": 0.23545670538440483, "learning_rate": 2.6956284153005468e-05, "loss": 0.4869, "num_tokens": 792523689.0, "step": 5235 }, { "epoch": 1.5461788138093833, "grad_norm": 0.2710043148146718, "learning_rate": 2.692896174863388e-05, "loss": 0.476, "num_tokens": 793236247.0, "step": 5240 }, { "epoch": 1.547654175272942, "grad_norm": 0.24048265010257372, "learning_rate": 2.6901639344262298e-05, "loss": 0.4801, "num_tokens": 794007806.0, "step": 5245 }, { "epoch": 1.5491295367365003, "grad_norm": 0.23997618313003646, "learning_rate": 2.6874316939890708e-05, "loss": 0.4702, "num_tokens": 794725443.0, "step": 5250 }, { "epoch": 1.550604898200059, "grad_norm": 0.27120474471930084, "learning_rate": 2.6846994535519128e-05, "loss": 0.4931, "num_tokens": 795524560.0, "step": 5255 }, { "epoch": 1.5520802596636176, "grad_norm": 0.2444429076551355, "learning_rate": 2.6819672131147545e-05, "loss": 0.4848, "num_tokens": 796252905.0, "step": 5260 }, { "epoch": 1.5535556211271762, "grad_norm": 0.2686396417479986, "learning_rate": 2.6792349726775954e-05, "loss": 0.4841, "num_tokens": 796925708.0, "step": 5265 }, { "epoch": 1.5550309825907347, "grad_norm": 0.3077231197957917, "learning_rate": 2.6765027322404375e-05, "loss": 0.4944, "num_tokens": 797637165.0, "step": 5270 }, { "epoch": 1.5565063440542932, "grad_norm": 0.24612276711445885, "learning_rate": 2.6737704918032784e-05, "loss": 0.4897, "num_tokens": 798415849.0, "step": 5275 }, { "epoch": 1.557981705517852, "grad_norm": 0.26486556969728203, "learning_rate": 2.6710382513661205e-05, "loss": 0.4615, "num_tokens": 799127054.0, "step": 5280 }, { "epoch": 1.5594570669814103, "grad_norm": 0.24827893678737203, "learning_rate": 2.668306010928962e-05, "loss": 0.4785, "num_tokens": 799866620.0, "step": 5285 }, { "epoch": 1.560932428444969, "grad_norm": 0.2384990118750994, "learning_rate": 2.665573770491803e-05, "loss": 0.4837, "num_tokens": 800668776.0, "step": 5290 }, { "epoch": 1.5624077899085276, "grad_norm": 0.2507970569192269, "learning_rate": 2.662841530054645e-05, "loss": 0.4804, "num_tokens": 801404225.0, "step": 5295 }, { "epoch": 1.5638831513720861, "grad_norm": 0.27317262624234023, "learning_rate": 2.6601092896174868e-05, "loss": 0.4948, "num_tokens": 802161455.0, "step": 5300 }, { "epoch": 1.5653585128356449, "grad_norm": 0.2625353031756185, "learning_rate": 2.6573770491803278e-05, "loss": 0.5, "num_tokens": 802921842.0, "step": 5305 }, { "epoch": 1.5668338742992032, "grad_norm": 0.26737146857697786, "learning_rate": 2.6546448087431698e-05, "loss": 0.4876, "num_tokens": 803679031.0, "step": 5310 }, { "epoch": 1.568309235762762, "grad_norm": 0.24669721941789766, "learning_rate": 2.6519125683060108e-05, "loss": 0.48, "num_tokens": 804420454.0, "step": 5315 }, { "epoch": 1.5697845972263205, "grad_norm": 0.2374187049267533, "learning_rate": 2.6491803278688525e-05, "loss": 0.4953, "num_tokens": 805237631.0, "step": 5320 }, { "epoch": 1.571259958689879, "grad_norm": 0.26151810376873014, "learning_rate": 2.6464480874316945e-05, "loss": 0.4776, "num_tokens": 806046175.0, "step": 5325 }, { "epoch": 1.5727353201534378, "grad_norm": 0.22877561446405298, "learning_rate": 2.6437158469945355e-05, "loss": 0.4838, "num_tokens": 806786652.0, "step": 5330 }, { "epoch": 1.574210681616996, "grad_norm": 0.2775842292165126, "learning_rate": 2.640983606557377e-05, "loss": 0.4768, "num_tokens": 807457848.0, "step": 5335 }, { "epoch": 1.5756860430805548, "grad_norm": 0.25774766980284175, "learning_rate": 2.6382513661202185e-05, "loss": 0.4487, "num_tokens": 808151954.0, "step": 5340 }, { "epoch": 1.5771614045441134, "grad_norm": 0.26184869163327307, "learning_rate": 2.63551912568306e-05, "loss": 0.4886, "num_tokens": 808932783.0, "step": 5345 }, { "epoch": 1.5786367660076719, "grad_norm": 0.22861505577330643, "learning_rate": 2.632786885245902e-05, "loss": 0.4895, "num_tokens": 809694956.0, "step": 5350 }, { "epoch": 1.5801121274712304, "grad_norm": 0.3170526046484242, "learning_rate": 2.630054644808743e-05, "loss": 0.4917, "num_tokens": 810455205.0, "step": 5355 }, { "epoch": 1.581587488934789, "grad_norm": 0.2519888954294694, "learning_rate": 2.6273224043715848e-05, "loss": 0.4744, "num_tokens": 811273776.0, "step": 5360 }, { "epoch": 1.5830628503983477, "grad_norm": 0.25397481746625644, "learning_rate": 2.624590163934426e-05, "loss": 0.4833, "num_tokens": 812045709.0, "step": 5365 }, { "epoch": 1.584538211861906, "grad_norm": 0.23873190066388814, "learning_rate": 2.6218579234972678e-05, "loss": 0.4949, "num_tokens": 812805457.0, "step": 5370 }, { "epoch": 1.5860135733254648, "grad_norm": 0.24206238393757637, "learning_rate": 2.6191256830601095e-05, "loss": 0.4652, "num_tokens": 813561317.0, "step": 5375 }, { "epoch": 1.5874889347890233, "grad_norm": 0.306772797943644, "learning_rate": 2.6163934426229508e-05, "loss": 0.4612, "num_tokens": 814257533.0, "step": 5380 }, { "epoch": 1.5889642962525818, "grad_norm": 0.2463168470052452, "learning_rate": 2.6136612021857925e-05, "loss": 0.4795, "num_tokens": 815044835.0, "step": 5385 }, { "epoch": 1.5904396577161406, "grad_norm": 0.2566323194533926, "learning_rate": 2.610928961748634e-05, "loss": 0.5047, "num_tokens": 815857858.0, "step": 5390 }, { "epoch": 1.591915019179699, "grad_norm": 0.24333332727718038, "learning_rate": 2.6081967213114755e-05, "loss": 0.5048, "num_tokens": 816654948.0, "step": 5395 }, { "epoch": 1.5933903806432577, "grad_norm": 0.2471163916475109, "learning_rate": 2.6054644808743172e-05, "loss": 0.5049, "num_tokens": 817436897.0, "step": 5400 }, { "epoch": 1.5948657421068162, "grad_norm": 0.2553094059920526, "learning_rate": 2.6027322404371585e-05, "loss": 0.4904, "num_tokens": 818177153.0, "step": 5405 }, { "epoch": 1.5963411035703747, "grad_norm": 0.24675370816497003, "learning_rate": 2.6000000000000002e-05, "loss": 0.4817, "num_tokens": 818998799.0, "step": 5410 }, { "epoch": 1.5978164650339333, "grad_norm": 0.2480720114510296, "learning_rate": 2.5972677595628415e-05, "loss": 0.5032, "num_tokens": 819765065.0, "step": 5415 }, { "epoch": 1.5992918264974918, "grad_norm": 0.24522875129545468, "learning_rate": 2.5945355191256832e-05, "loss": 0.4928, "num_tokens": 820520758.0, "step": 5420 }, { "epoch": 1.6007671879610506, "grad_norm": 0.2599390671016569, "learning_rate": 2.591803278688525e-05, "loss": 0.4704, "num_tokens": 821254757.0, "step": 5425 }, { "epoch": 1.6022425494246089, "grad_norm": 0.24467902411569262, "learning_rate": 2.5890710382513662e-05, "loss": 0.4758, "num_tokens": 822020081.0, "step": 5430 }, { "epoch": 1.6037179108881676, "grad_norm": 0.23965811835350245, "learning_rate": 2.586338797814208e-05, "loss": 0.4848, "num_tokens": 822819512.0, "step": 5435 }, { "epoch": 1.6051932723517262, "grad_norm": 0.2665907638598529, "learning_rate": 2.5836065573770492e-05, "loss": 0.4964, "num_tokens": 823566193.0, "step": 5440 }, { "epoch": 1.6066686338152847, "grad_norm": 0.2483690578695113, "learning_rate": 2.580874316939891e-05, "loss": 0.4819, "num_tokens": 824342287.0, "step": 5445 }, { "epoch": 1.6081439952788434, "grad_norm": 0.28419401812508066, "learning_rate": 2.5781420765027325e-05, "loss": 0.4918, "num_tokens": 825101911.0, "step": 5450 }, { "epoch": 1.6096193567424018, "grad_norm": 0.25697375183887655, "learning_rate": 2.575409836065574e-05, "loss": 0.4679, "num_tokens": 825841710.0, "step": 5455 }, { "epoch": 1.6110947182059605, "grad_norm": 0.2493136789256838, "learning_rate": 2.5726775956284155e-05, "loss": 0.4699, "num_tokens": 826566994.0, "step": 5460 }, { "epoch": 1.612570079669519, "grad_norm": 0.24968202383321214, "learning_rate": 2.5699453551912572e-05, "loss": 0.4951, "num_tokens": 827379455.0, "step": 5465 }, { "epoch": 1.6140454411330776, "grad_norm": 0.2600539419809177, "learning_rate": 2.5672131147540985e-05, "loss": 0.4816, "num_tokens": 828104956.0, "step": 5470 }, { "epoch": 1.6155208025966363, "grad_norm": 0.27050859013083767, "learning_rate": 2.5644808743169402e-05, "loss": 0.482, "num_tokens": 828854982.0, "step": 5475 }, { "epoch": 1.6169961640601946, "grad_norm": 0.25804494410983353, "learning_rate": 2.5617486338797812e-05, "loss": 0.4826, "num_tokens": 829615805.0, "step": 5480 }, { "epoch": 1.6184715255237534, "grad_norm": 0.24878414961647066, "learning_rate": 2.5590163934426232e-05, "loss": 0.4725, "num_tokens": 830330972.0, "step": 5485 }, { "epoch": 1.619946886987312, "grad_norm": 0.26296417707879616, "learning_rate": 2.556284153005465e-05, "loss": 0.4862, "num_tokens": 831123253.0, "step": 5490 }, { "epoch": 1.6214222484508705, "grad_norm": 0.2496456649712489, "learning_rate": 2.5535519125683062e-05, "loss": 0.4805, "num_tokens": 831916427.0, "step": 5495 }, { "epoch": 1.622897609914429, "grad_norm": 0.24840882850576018, "learning_rate": 2.550819672131148e-05, "loss": 0.4842, "num_tokens": 832712967.0, "step": 5500 }, { "epoch": 1.6243729713779875, "grad_norm": 0.25221942382274437, "learning_rate": 2.548087431693989e-05, "loss": 0.4771, "num_tokens": 833412231.0, "step": 5505 }, { "epoch": 1.6258483328415463, "grad_norm": 0.2564924807191284, "learning_rate": 2.545355191256831e-05, "loss": 0.4749, "num_tokens": 834145242.0, "step": 5510 }, { "epoch": 1.6273236943051046, "grad_norm": 0.25259395498488235, "learning_rate": 2.5426229508196726e-05, "loss": 0.4953, "num_tokens": 834922266.0, "step": 5515 }, { "epoch": 1.6287990557686634, "grad_norm": 0.25877421341106244, "learning_rate": 2.5398907103825136e-05, "loss": 0.4647, "num_tokens": 835591805.0, "step": 5520 }, { "epoch": 1.6302744172322219, "grad_norm": 0.2545539715402776, "learning_rate": 2.5371584699453556e-05, "loss": 0.4747, "num_tokens": 836337946.0, "step": 5525 }, { "epoch": 1.6317497786957804, "grad_norm": 0.24740166648439904, "learning_rate": 2.5344262295081966e-05, "loss": 0.4695, "num_tokens": 837092736.0, "step": 5530 }, { "epoch": 1.6332251401593392, "grad_norm": 0.2611456367857681, "learning_rate": 2.5316939890710382e-05, "loss": 0.4778, "num_tokens": 837826327.0, "step": 5535 }, { "epoch": 1.6347005016228975, "grad_norm": 0.2547792916716967, "learning_rate": 2.5289617486338802e-05, "loss": 0.4911, "num_tokens": 838576035.0, "step": 5540 }, { "epoch": 1.6361758630864562, "grad_norm": 0.27582816552098577, "learning_rate": 2.5262295081967212e-05, "loss": 0.4914, "num_tokens": 839302859.0, "step": 5545 }, { "epoch": 1.6376512245500148, "grad_norm": 0.26001758912904904, "learning_rate": 2.5234972677595632e-05, "loss": 0.4846, "num_tokens": 840063267.0, "step": 5550 }, { "epoch": 1.6391265860135733, "grad_norm": 0.27356158673483694, "learning_rate": 2.5207650273224042e-05, "loss": 0.467, "num_tokens": 840753186.0, "step": 5555 }, { "epoch": 1.640601947477132, "grad_norm": 0.2447905970474178, "learning_rate": 2.518032786885246e-05, "loss": 0.4872, "num_tokens": 841517382.0, "step": 5560 }, { "epoch": 1.6420773089406904, "grad_norm": 0.2513817957370534, "learning_rate": 2.515300546448088e-05, "loss": 0.4893, "num_tokens": 842267493.0, "step": 5565 }, { "epoch": 1.6435526704042491, "grad_norm": 0.27189997361137913, "learning_rate": 2.512568306010929e-05, "loss": 0.4685, "num_tokens": 842948858.0, "step": 5570 }, { "epoch": 1.6450280318678077, "grad_norm": 0.26521023955636636, "learning_rate": 2.5098360655737706e-05, "loss": 0.4852, "num_tokens": 843689933.0, "step": 5575 }, { "epoch": 1.6465033933313662, "grad_norm": 0.2547490711440282, "learning_rate": 2.507103825136612e-05, "loss": 0.4852, "num_tokens": 844484867.0, "step": 5580 }, { "epoch": 1.6479787547949247, "grad_norm": 0.26546299151865654, "learning_rate": 2.5043715846994536e-05, "loss": 0.4933, "num_tokens": 845224054.0, "step": 5585 }, { "epoch": 1.6494541162584833, "grad_norm": 0.267288023467938, "learning_rate": 2.5016393442622953e-05, "loss": 0.4897, "num_tokens": 845967478.0, "step": 5590 }, { "epoch": 1.650929477722042, "grad_norm": 0.24205939846025978, "learning_rate": 2.4989071038251366e-05, "loss": 0.4617, "num_tokens": 846706599.0, "step": 5595 }, { "epoch": 1.6524048391856003, "grad_norm": 0.23086801536360954, "learning_rate": 2.4961748633879783e-05, "loss": 0.4718, "num_tokens": 847458283.0, "step": 5600 }, { "epoch": 1.653880200649159, "grad_norm": 0.24490947243254893, "learning_rate": 2.49344262295082e-05, "loss": 0.4825, "num_tokens": 848242688.0, "step": 5605 }, { "epoch": 1.6553555621127176, "grad_norm": 0.23700765422074618, "learning_rate": 2.4907103825136613e-05, "loss": 0.4945, "num_tokens": 849090611.0, "step": 5610 }, { "epoch": 1.6568309235762761, "grad_norm": 0.24730863201519265, "learning_rate": 2.4879781420765026e-05, "loss": 0.4719, "num_tokens": 849813047.0, "step": 5615 }, { "epoch": 1.658306285039835, "grad_norm": 0.23308406480280747, "learning_rate": 2.4852459016393446e-05, "loss": 0.4675, "num_tokens": 850507704.0, "step": 5620 }, { "epoch": 1.6597816465033932, "grad_norm": 0.23551789548917754, "learning_rate": 2.482513661202186e-05, "loss": 0.4868, "num_tokens": 851272415.0, "step": 5625 }, { "epoch": 1.661257007966952, "grad_norm": 0.22535698642652863, "learning_rate": 2.4797814207650276e-05, "loss": 0.4814, "num_tokens": 852022874.0, "step": 5630 }, { "epoch": 1.6627323694305105, "grad_norm": 0.2555721752049558, "learning_rate": 2.477049180327869e-05, "loss": 0.4841, "num_tokens": 852779079.0, "step": 5635 }, { "epoch": 1.664207730894069, "grad_norm": 0.2623834782209739, "learning_rate": 2.4743169398907103e-05, "loss": 0.4745, "num_tokens": 853492971.0, "step": 5640 }, { "epoch": 1.6656830923576276, "grad_norm": 0.25841218639518193, "learning_rate": 2.4715846994535523e-05, "loss": 0.4845, "num_tokens": 854284412.0, "step": 5645 }, { "epoch": 1.667158453821186, "grad_norm": 0.23797912298980278, "learning_rate": 2.4688524590163936e-05, "loss": 0.4841, "num_tokens": 855006096.0, "step": 5650 }, { "epoch": 1.6686338152847449, "grad_norm": 0.24480003818649942, "learning_rate": 2.466120218579235e-05, "loss": 0.4737, "num_tokens": 855744473.0, "step": 5655 }, { "epoch": 1.6701091767483032, "grad_norm": 0.25407833392993096, "learning_rate": 2.4633879781420766e-05, "loss": 0.4733, "num_tokens": 856470387.0, "step": 5660 }, { "epoch": 1.671584538211862, "grad_norm": 0.26284262063100694, "learning_rate": 2.460655737704918e-05, "loss": 0.4792, "num_tokens": 857213150.0, "step": 5665 }, { "epoch": 1.6730598996754205, "grad_norm": 0.2416063493828959, "learning_rate": 2.4579234972677596e-05, "loss": 0.4762, "num_tokens": 857925012.0, "step": 5670 }, { "epoch": 1.674535261138979, "grad_norm": 0.22821181409746424, "learning_rate": 2.4551912568306013e-05, "loss": 0.4803, "num_tokens": 858687104.0, "step": 5675 }, { "epoch": 1.6760106226025377, "grad_norm": 0.2504183085299953, "learning_rate": 2.4524590163934426e-05, "loss": 0.485, "num_tokens": 859439342.0, "step": 5680 }, { "epoch": 1.677485984066096, "grad_norm": 0.23490398902429857, "learning_rate": 2.4497267759562843e-05, "loss": 0.4986, "num_tokens": 860208961.0, "step": 5685 }, { "epoch": 1.6789613455296548, "grad_norm": 0.25125800808011917, "learning_rate": 2.4469945355191256e-05, "loss": 0.4795, "num_tokens": 860949576.0, "step": 5690 }, { "epoch": 1.6804367069932133, "grad_norm": 0.2263841956411643, "learning_rate": 2.4442622950819673e-05, "loss": 0.472, "num_tokens": 861702806.0, "step": 5695 }, { "epoch": 1.6819120684567719, "grad_norm": 0.2354048344609682, "learning_rate": 2.441530054644809e-05, "loss": 0.4973, "num_tokens": 862495846.0, "step": 5700 }, { "epoch": 1.6833874299203306, "grad_norm": 0.28025097910937236, "learning_rate": 2.4387978142076503e-05, "loss": 0.4929, "num_tokens": 863225652.0, "step": 5705 }, { "epoch": 1.684862791383889, "grad_norm": 0.24364886353948384, "learning_rate": 2.436065573770492e-05, "loss": 0.4873, "num_tokens": 863981384.0, "step": 5710 }, { "epoch": 1.6863381528474477, "grad_norm": 0.25718843883567316, "learning_rate": 2.4333333333333336e-05, "loss": 0.495, "num_tokens": 864751611.0, "step": 5715 }, { "epoch": 1.6878135143110062, "grad_norm": 0.25093859066890634, "learning_rate": 2.430601092896175e-05, "loss": 0.4704, "num_tokens": 865457518.0, "step": 5720 }, { "epoch": 1.6892888757745648, "grad_norm": 0.24768300641105373, "learning_rate": 2.4278688524590166e-05, "loss": 0.4837, "num_tokens": 866199423.0, "step": 5725 }, { "epoch": 1.6907642372381233, "grad_norm": 0.26954716776595583, "learning_rate": 2.425136612021858e-05, "loss": 0.4763, "num_tokens": 866920605.0, "step": 5730 }, { "epoch": 1.6922395987016818, "grad_norm": 0.26445108551809393, "learning_rate": 2.4224043715846993e-05, "loss": 0.4919, "num_tokens": 867677402.0, "step": 5735 }, { "epoch": 1.6937149601652406, "grad_norm": 0.2587114997776466, "learning_rate": 2.4196721311475413e-05, "loss": 0.4769, "num_tokens": 868377843.0, "step": 5740 }, { "epoch": 1.695190321628799, "grad_norm": 0.24185493968477004, "learning_rate": 2.4169398907103826e-05, "loss": 0.4825, "num_tokens": 869138586.0, "step": 5745 }, { "epoch": 1.6966656830923577, "grad_norm": 0.2275655618331662, "learning_rate": 2.4142076502732243e-05, "loss": 0.4833, "num_tokens": 869913729.0, "step": 5750 }, { "epoch": 1.6981410445559162, "grad_norm": 0.24100069201069327, "learning_rate": 2.4114754098360657e-05, "loss": 0.461, "num_tokens": 870666062.0, "step": 5755 }, { "epoch": 1.6996164060194747, "grad_norm": 0.24052035467192387, "learning_rate": 2.408743169398907e-05, "loss": 0.4796, "num_tokens": 871417425.0, "step": 5760 }, { "epoch": 1.7010917674830335, "grad_norm": 0.23615350292910461, "learning_rate": 2.406010928961749e-05, "loss": 0.4934, "num_tokens": 872225039.0, "step": 5765 }, { "epoch": 1.7025671289465918, "grad_norm": 0.2926544173672724, "learning_rate": 2.4032786885245903e-05, "loss": 0.4837, "num_tokens": 872957407.0, "step": 5770 }, { "epoch": 1.7040424904101505, "grad_norm": 0.2435643259682938, "learning_rate": 2.4005464480874317e-05, "loss": 0.4925, "num_tokens": 873749333.0, "step": 5775 }, { "epoch": 1.705517851873709, "grad_norm": 0.2504463327402888, "learning_rate": 2.3978142076502733e-05, "loss": 0.4758, "num_tokens": 874473950.0, "step": 5780 }, { "epoch": 1.7069932133372676, "grad_norm": 0.2613860942453885, "learning_rate": 2.3950819672131147e-05, "loss": 0.485, "num_tokens": 875229114.0, "step": 5785 }, { "epoch": 1.7084685748008264, "grad_norm": 0.2502732823794517, "learning_rate": 2.3923497267759563e-05, "loss": 0.4882, "num_tokens": 876014217.0, "step": 5790 }, { "epoch": 1.7099439362643847, "grad_norm": 0.2617273838397416, "learning_rate": 2.389617486338798e-05, "loss": 0.4877, "num_tokens": 876744761.0, "step": 5795 }, { "epoch": 1.7114192977279434, "grad_norm": 0.22397273968659473, "learning_rate": 2.3868852459016393e-05, "loss": 0.4858, "num_tokens": 877544978.0, "step": 5800 }, { "epoch": 1.712894659191502, "grad_norm": 0.2367773720209675, "learning_rate": 2.384153005464481e-05, "loss": 0.4794, "num_tokens": 878298146.0, "step": 5805 }, { "epoch": 1.7143700206550605, "grad_norm": 0.25170512234523734, "learning_rate": 2.3814207650273227e-05, "loss": 0.4825, "num_tokens": 879046967.0, "step": 5810 }, { "epoch": 1.715845382118619, "grad_norm": 0.23558780657277112, "learning_rate": 2.378688524590164e-05, "loss": 0.4882, "num_tokens": 879882059.0, "step": 5815 }, { "epoch": 1.7173207435821776, "grad_norm": 0.2473401982634158, "learning_rate": 2.3759562841530057e-05, "loss": 0.4698, "num_tokens": 880599085.0, "step": 5820 }, { "epoch": 1.7187961050457363, "grad_norm": 0.2534148823843184, "learning_rate": 2.373224043715847e-05, "loss": 0.47, "num_tokens": 881294044.0, "step": 5825 }, { "epoch": 1.7202714665092946, "grad_norm": 0.25927969893767366, "learning_rate": 2.3704918032786887e-05, "loss": 0.477, "num_tokens": 882000451.0, "step": 5830 }, { "epoch": 1.7217468279728534, "grad_norm": 0.2590073014944887, "learning_rate": 2.3677595628415304e-05, "loss": 0.4777, "num_tokens": 882698477.0, "step": 5835 }, { "epoch": 1.723222189436412, "grad_norm": 0.234185171191807, "learning_rate": 2.3650273224043717e-05, "loss": 0.4702, "num_tokens": 883470644.0, "step": 5840 }, { "epoch": 1.7246975508999705, "grad_norm": 0.23297369182586042, "learning_rate": 2.3622950819672134e-05, "loss": 0.4769, "num_tokens": 884210520.0, "step": 5845 }, { "epoch": 1.7261729123635292, "grad_norm": 0.2645432933565354, "learning_rate": 2.3595628415300547e-05, "loss": 0.4855, "num_tokens": 884891418.0, "step": 5850 }, { "epoch": 1.7276482738270875, "grad_norm": 0.2570451270981347, "learning_rate": 2.356830601092896e-05, "loss": 0.4872, "num_tokens": 885671418.0, "step": 5855 }, { "epoch": 1.7291236352906463, "grad_norm": 0.2637812033430716, "learning_rate": 2.354098360655738e-05, "loss": 0.4955, "num_tokens": 886420504.0, "step": 5860 }, { "epoch": 1.7305989967542048, "grad_norm": 0.2648266130716111, "learning_rate": 2.3513661202185794e-05, "loss": 0.4719, "num_tokens": 887146674.0, "step": 5865 }, { "epoch": 1.7320743582177633, "grad_norm": 0.2380484205060175, "learning_rate": 2.3486338797814207e-05, "loss": 0.4769, "num_tokens": 887923679.0, "step": 5870 }, { "epoch": 1.7335497196813219, "grad_norm": 0.24440897431332506, "learning_rate": 2.3459016393442624e-05, "loss": 0.4632, "num_tokens": 888673090.0, "step": 5875 }, { "epoch": 1.7350250811448804, "grad_norm": 0.24879603121343544, "learning_rate": 2.343169398907104e-05, "loss": 0.4564, "num_tokens": 889404261.0, "step": 5880 }, { "epoch": 1.7365004426084392, "grad_norm": 0.24721152041516953, "learning_rate": 2.3404371584699457e-05, "loss": 0.4837, "num_tokens": 890183083.0, "step": 5885 }, { "epoch": 1.7379758040719975, "grad_norm": 0.24383678140673515, "learning_rate": 2.337704918032787e-05, "loss": 0.4803, "num_tokens": 890953248.0, "step": 5890 }, { "epoch": 1.7394511655355562, "grad_norm": 0.24254774691047623, "learning_rate": 2.3349726775956284e-05, "loss": 0.466, "num_tokens": 891664918.0, "step": 5895 }, { "epoch": 1.7409265269991148, "grad_norm": 0.253116771626386, "learning_rate": 2.33224043715847e-05, "loss": 0.4978, "num_tokens": 892462877.0, "step": 5900 }, { "epoch": 1.7424018884626733, "grad_norm": 0.2472861650374839, "learning_rate": 2.3295081967213117e-05, "loss": 0.475, "num_tokens": 893211031.0, "step": 5905 }, { "epoch": 1.743877249926232, "grad_norm": 0.2492191227894404, "learning_rate": 2.326775956284153e-05, "loss": 0.4846, "num_tokens": 893981073.0, "step": 5910 }, { "epoch": 1.7453526113897904, "grad_norm": 0.22990170705094268, "learning_rate": 2.3240437158469947e-05, "loss": 0.4695, "num_tokens": 894786314.0, "step": 5915 }, { "epoch": 1.7468279728533491, "grad_norm": 0.2846785269271698, "learning_rate": 2.321311475409836e-05, "loss": 0.4814, "num_tokens": 895589769.0, "step": 5920 }, { "epoch": 1.7483033343169077, "grad_norm": 0.2495391952172782, "learning_rate": 2.3185792349726777e-05, "loss": 0.4663, "num_tokens": 896337966.0, "step": 5925 }, { "epoch": 1.7497786957804662, "grad_norm": 0.25226958235618113, "learning_rate": 2.3158469945355194e-05, "loss": 0.4817, "num_tokens": 897113425.0, "step": 5930 }, { "epoch": 1.751254057244025, "grad_norm": 0.2556459307776369, "learning_rate": 2.3131147540983607e-05, "loss": 0.454, "num_tokens": 897858519.0, "step": 5935 }, { "epoch": 1.7527294187075833, "grad_norm": 0.2408701195515598, "learning_rate": 2.3103825136612024e-05, "loss": 0.4866, "num_tokens": 898630992.0, "step": 5940 }, { "epoch": 1.754204780171142, "grad_norm": 0.26431140260502733, "learning_rate": 2.3076502732240437e-05, "loss": 0.4875, "num_tokens": 899418608.0, "step": 5945 }, { "epoch": 1.7556801416347005, "grad_norm": 0.23624108458088527, "learning_rate": 2.304918032786885e-05, "loss": 0.4784, "num_tokens": 900174384.0, "step": 5950 }, { "epoch": 1.757155503098259, "grad_norm": 0.2643389948603392, "learning_rate": 2.302185792349727e-05, "loss": 0.4806, "num_tokens": 900903598.0, "step": 5955 }, { "epoch": 1.7586308645618176, "grad_norm": 0.24247628253140155, "learning_rate": 2.2994535519125684e-05, "loss": 0.4863, "num_tokens": 901711248.0, "step": 5960 }, { "epoch": 1.7601062260253761, "grad_norm": 0.2518234785378302, "learning_rate": 2.29672131147541e-05, "loss": 0.4932, "num_tokens": 902511896.0, "step": 5965 }, { "epoch": 1.761581587488935, "grad_norm": 0.255913681989029, "learning_rate": 2.2939890710382514e-05, "loss": 0.4821, "num_tokens": 903262405.0, "step": 5970 }, { "epoch": 1.7630569489524932, "grad_norm": 0.28342447950695165, "learning_rate": 2.291256830601093e-05, "loss": 0.4777, "num_tokens": 904009225.0, "step": 5975 }, { "epoch": 1.764532310416052, "grad_norm": 0.24808846753388944, "learning_rate": 2.2885245901639347e-05, "loss": 0.4871, "num_tokens": 904733532.0, "step": 5980 }, { "epoch": 1.7660076718796105, "grad_norm": 0.2413957827332568, "learning_rate": 2.285792349726776e-05, "loss": 0.4785, "num_tokens": 905524065.0, "step": 5985 }, { "epoch": 1.767483033343169, "grad_norm": 0.25067650592850765, "learning_rate": 2.2830601092896174e-05, "loss": 0.4865, "num_tokens": 906284859.0, "step": 5990 }, { "epoch": 1.7689583948067278, "grad_norm": 0.2704231415323106, "learning_rate": 2.280327868852459e-05, "loss": 0.4717, "num_tokens": 907000168.0, "step": 5995 }, { "epoch": 1.770433756270286, "grad_norm": 0.25313013470806156, "learning_rate": 2.2775956284153008e-05, "loss": 0.4757, "num_tokens": 907765514.0, "step": 6000 }, { "epoch": 1.7719091177338449, "grad_norm": 0.24301127238255416, "learning_rate": 2.274863387978142e-05, "loss": 0.4682, "num_tokens": 908501879.0, "step": 6005 }, { "epoch": 1.7733844791974034, "grad_norm": 0.2438958983490277, "learning_rate": 2.2721311475409838e-05, "loss": 0.4733, "num_tokens": 909188147.0, "step": 6010 }, { "epoch": 1.774859840660962, "grad_norm": 0.2420596978607123, "learning_rate": 2.269398907103825e-05, "loss": 0.4823, "num_tokens": 909924706.0, "step": 6015 }, { "epoch": 1.7763352021245207, "grad_norm": 0.22547873877910923, "learning_rate": 2.2666666666666668e-05, "loss": 0.4979, "num_tokens": 910714540.0, "step": 6020 }, { "epoch": 1.777810563588079, "grad_norm": 0.25511022214925627, "learning_rate": 2.2639344262295084e-05, "loss": 0.4808, "num_tokens": 911477380.0, "step": 6025 }, { "epoch": 1.7792859250516377, "grad_norm": 0.26036777575669073, "learning_rate": 2.2612021857923498e-05, "loss": 0.4763, "num_tokens": 912272875.0, "step": 6030 }, { "epoch": 1.7807612865151963, "grad_norm": 0.23890514444632097, "learning_rate": 2.2584699453551914e-05, "loss": 0.4673, "num_tokens": 913045566.0, "step": 6035 }, { "epoch": 1.7822366479787548, "grad_norm": 0.2408052774705555, "learning_rate": 2.2557377049180328e-05, "loss": 0.493, "num_tokens": 913838833.0, "step": 6040 }, { "epoch": 1.7837120094423133, "grad_norm": 0.24096380615264568, "learning_rate": 2.2530054644808744e-05, "loss": 0.4828, "num_tokens": 914613603.0, "step": 6045 }, { "epoch": 1.7851873709058719, "grad_norm": 0.25621497247005304, "learning_rate": 2.250273224043716e-05, "loss": 0.4667, "num_tokens": 915371755.0, "step": 6050 }, { "epoch": 1.7866627323694306, "grad_norm": 0.24013172806653543, "learning_rate": 2.2475409836065574e-05, "loss": 0.4828, "num_tokens": 916157159.0, "step": 6055 }, { "epoch": 1.788138093832989, "grad_norm": 0.2466385064919165, "learning_rate": 2.244808743169399e-05, "loss": 0.476, "num_tokens": 916928458.0, "step": 6060 }, { "epoch": 1.7896134552965477, "grad_norm": 0.25285275720238637, "learning_rate": 2.2420765027322404e-05, "loss": 0.47, "num_tokens": 917637304.0, "step": 6065 }, { "epoch": 1.7910888167601062, "grad_norm": 0.23181467745732334, "learning_rate": 2.239344262295082e-05, "loss": 0.477, "num_tokens": 918441700.0, "step": 6070 }, { "epoch": 1.7925641782236648, "grad_norm": 0.2261619651352211, "learning_rate": 2.2366120218579238e-05, "loss": 0.4719, "num_tokens": 919215971.0, "step": 6075 }, { "epoch": 1.7940395396872235, "grad_norm": 0.24988553442032072, "learning_rate": 2.233879781420765e-05, "loss": 0.4797, "num_tokens": 920002233.0, "step": 6080 }, { "epoch": 1.7955149011507818, "grad_norm": 0.2524130066547855, "learning_rate": 2.2311475409836065e-05, "loss": 0.4994, "num_tokens": 920799727.0, "step": 6085 }, { "epoch": 1.7969902626143406, "grad_norm": 0.22401172231285657, "learning_rate": 2.228415300546448e-05, "loss": 0.4931, "num_tokens": 921612608.0, "step": 6090 }, { "epoch": 1.7984656240778991, "grad_norm": 0.29577203152938164, "learning_rate": 2.2256830601092898e-05, "loss": 0.4846, "num_tokens": 922315832.0, "step": 6095 }, { "epoch": 1.7999409855414576, "grad_norm": 0.25065273609486644, "learning_rate": 2.2229508196721315e-05, "loss": 0.4858, "num_tokens": 923108054.0, "step": 6100 }, { "epoch": 1.8014163470050162, "grad_norm": 0.2503032349122977, "learning_rate": 2.2202185792349728e-05, "loss": 0.4756, "num_tokens": 923836645.0, "step": 6105 }, { "epoch": 1.8028917084685747, "grad_norm": 0.2676931520291393, "learning_rate": 2.217486338797814e-05, "loss": 0.4698, "num_tokens": 924607377.0, "step": 6110 }, { "epoch": 1.8043670699321335, "grad_norm": 0.2330188691275677, "learning_rate": 2.2147540983606558e-05, "loss": 0.457, "num_tokens": 925367216.0, "step": 6115 }, { "epoch": 1.8058424313956918, "grad_norm": 0.25066967692666803, "learning_rate": 2.2120218579234975e-05, "loss": 0.4697, "num_tokens": 926074882.0, "step": 6120 }, { "epoch": 1.8073177928592505, "grad_norm": 0.2502784812921067, "learning_rate": 2.2092896174863388e-05, "loss": 0.4884, "num_tokens": 926829544.0, "step": 6125 }, { "epoch": 1.808793154322809, "grad_norm": 0.2975147614821617, "learning_rate": 2.2065573770491805e-05, "loss": 0.472, "num_tokens": 927493283.0, "step": 6130 }, { "epoch": 1.8102685157863676, "grad_norm": 0.24263394208674932, "learning_rate": 2.2038251366120218e-05, "loss": 0.4698, "num_tokens": 928250151.0, "step": 6135 }, { "epoch": 1.8117438772499264, "grad_norm": 0.27091546543199496, "learning_rate": 2.2010928961748635e-05, "loss": 0.4755, "num_tokens": 928929941.0, "step": 6140 }, { "epoch": 1.8132192387134847, "grad_norm": 0.23761269359393047, "learning_rate": 2.198360655737705e-05, "loss": 0.4827, "num_tokens": 929728411.0, "step": 6145 }, { "epoch": 1.8146946001770434, "grad_norm": 0.262899216922802, "learning_rate": 2.1956284153005465e-05, "loss": 0.4691, "num_tokens": 930494297.0, "step": 6150 }, { "epoch": 1.816169961640602, "grad_norm": 0.25186771687467396, "learning_rate": 2.192896174863388e-05, "loss": 0.4799, "num_tokens": 931273334.0, "step": 6155 }, { "epoch": 1.8176453231041605, "grad_norm": 0.25008779536250064, "learning_rate": 2.1901639344262295e-05, "loss": 0.482, "num_tokens": 932048771.0, "step": 6160 }, { "epoch": 1.8191206845677192, "grad_norm": 0.23842639557393, "learning_rate": 2.187431693989071e-05, "loss": 0.4748, "num_tokens": 932761257.0, "step": 6165 }, { "epoch": 1.8205960460312776, "grad_norm": 0.24665700667136106, "learning_rate": 2.1846994535519128e-05, "loss": 0.4627, "num_tokens": 933428010.0, "step": 6170 }, { "epoch": 1.8220714074948363, "grad_norm": 0.23334604133568346, "learning_rate": 2.181967213114754e-05, "loss": 0.4835, "num_tokens": 934221326.0, "step": 6175 }, { "epoch": 1.8235467689583948, "grad_norm": 0.25252197675000365, "learning_rate": 2.1792349726775958e-05, "loss": 0.4925, "num_tokens": 935021245.0, "step": 6180 }, { "epoch": 1.8250221304219534, "grad_norm": 0.2507658797756519, "learning_rate": 2.176502732240437e-05, "loss": 0.4869, "num_tokens": 935832341.0, "step": 6185 }, { "epoch": 1.826497491885512, "grad_norm": 0.25128597299669175, "learning_rate": 2.173770491803279e-05, "loss": 0.4758, "num_tokens": 936578931.0, "step": 6190 }, { "epoch": 1.8279728533490704, "grad_norm": 0.2579655317009882, "learning_rate": 2.1710382513661205e-05, "loss": 0.4799, "num_tokens": 937341716.0, "step": 6195 }, { "epoch": 1.8294482148126292, "grad_norm": 0.24397807990909337, "learning_rate": 2.168306010928962e-05, "loss": 0.4755, "num_tokens": 938060077.0, "step": 6200 }, { "epoch": 1.8309235762761875, "grad_norm": 0.2419651387433952, "learning_rate": 2.165573770491803e-05, "loss": 0.4805, "num_tokens": 938843382.0, "step": 6205 }, { "epoch": 1.8323989377397463, "grad_norm": 0.2279947447262214, "learning_rate": 2.162841530054645e-05, "loss": 0.4657, "num_tokens": 939664590.0, "step": 6210 }, { "epoch": 1.8338742992033048, "grad_norm": 0.26943793183246173, "learning_rate": 2.1601092896174865e-05, "loss": 0.4844, "num_tokens": 940365787.0, "step": 6215 }, { "epoch": 1.8353496606668633, "grad_norm": 0.26517343755579426, "learning_rate": 2.1573770491803282e-05, "loss": 0.4855, "num_tokens": 941146565.0, "step": 6220 }, { "epoch": 1.836825022130422, "grad_norm": 0.23929383716269778, "learning_rate": 2.1546448087431695e-05, "loss": 0.4958, "num_tokens": 941897398.0, "step": 6225 }, { "epoch": 1.8383003835939804, "grad_norm": 0.21760795088365195, "learning_rate": 2.151912568306011e-05, "loss": 0.4579, "num_tokens": 942667507.0, "step": 6230 }, { "epoch": 1.8397757450575392, "grad_norm": 0.23726294928111016, "learning_rate": 2.1491803278688525e-05, "loss": 0.4735, "num_tokens": 943402164.0, "step": 6235 }, { "epoch": 1.8412511065210977, "grad_norm": 0.2578342842232093, "learning_rate": 2.1464480874316942e-05, "loss": 0.484, "num_tokens": 944124131.0, "step": 6240 }, { "epoch": 1.8427264679846562, "grad_norm": 0.25384072052561024, "learning_rate": 2.1437158469945355e-05, "loss": 0.4842, "num_tokens": 944931992.0, "step": 6245 }, { "epoch": 1.8442018294482148, "grad_norm": 0.24566602340153826, "learning_rate": 2.1409836065573772e-05, "loss": 0.4808, "num_tokens": 945654314.0, "step": 6250 }, { "epoch": 1.8456771909117733, "grad_norm": 0.23846153325260872, "learning_rate": 2.1382513661202185e-05, "loss": 0.4695, "num_tokens": 946374743.0, "step": 6255 }, { "epoch": 1.847152552375332, "grad_norm": 0.2292994949464841, "learning_rate": 2.1355191256830602e-05, "loss": 0.4823, "num_tokens": 947141208.0, "step": 6260 }, { "epoch": 1.8486279138388906, "grad_norm": 0.23681166305316212, "learning_rate": 2.132786885245902e-05, "loss": 0.4784, "num_tokens": 947915751.0, "step": 6265 }, { "epoch": 1.8501032753024491, "grad_norm": 0.23259718289474143, "learning_rate": 2.1300546448087432e-05, "loss": 0.492, "num_tokens": 948734167.0, "step": 6270 }, { "epoch": 1.8515786367660076, "grad_norm": 0.24330471667775783, "learning_rate": 2.127322404371585e-05, "loss": 0.4948, "num_tokens": 949505667.0, "step": 6275 }, { "epoch": 1.8530539982295662, "grad_norm": 0.23383114557349008, "learning_rate": 2.1245901639344262e-05, "loss": 0.4827, "num_tokens": 950253767.0, "step": 6280 }, { "epoch": 1.854529359693125, "grad_norm": 0.25783286945708417, "learning_rate": 2.121857923497268e-05, "loss": 0.479, "num_tokens": 951039488.0, "step": 6285 }, { "epoch": 1.8560047211566832, "grad_norm": 0.2431597085427363, "learning_rate": 2.1191256830601095e-05, "loss": 0.4896, "num_tokens": 951844347.0, "step": 6290 }, { "epoch": 1.857480082620242, "grad_norm": 0.2452688634411135, "learning_rate": 2.116393442622951e-05, "loss": 0.4842, "num_tokens": 952641535.0, "step": 6295 }, { "epoch": 1.8589554440838005, "grad_norm": 0.24863458739472827, "learning_rate": 2.1136612021857925e-05, "loss": 0.4879, "num_tokens": 953398259.0, "step": 6300 }, { "epoch": 1.860430805547359, "grad_norm": 0.254864824451964, "learning_rate": 2.110928961748634e-05, "loss": 0.4901, "num_tokens": 954217732.0, "step": 6305 }, { "epoch": 1.8619061670109178, "grad_norm": 0.24916130249645216, "learning_rate": 2.1081967213114755e-05, "loss": 0.4873, "num_tokens": 955027507.0, "step": 6310 }, { "epoch": 1.8633815284744761, "grad_norm": 0.2599932308148566, "learning_rate": 2.1054644808743172e-05, "loss": 0.4635, "num_tokens": 955732411.0, "step": 6315 }, { "epoch": 1.864856889938035, "grad_norm": 0.23384058817485856, "learning_rate": 2.1027322404371586e-05, "loss": 0.473, "num_tokens": 956482233.0, "step": 6320 }, { "epoch": 1.8663322514015934, "grad_norm": 0.22066216999800944, "learning_rate": 2.1e-05, "loss": 0.4823, "num_tokens": 957305621.0, "step": 6325 }, { "epoch": 1.867807612865152, "grad_norm": 0.2419320836437357, "learning_rate": 2.0972677595628416e-05, "loss": 0.4874, "num_tokens": 958077531.0, "step": 6330 }, { "epoch": 1.8692829743287105, "grad_norm": 0.22825941983196035, "learning_rate": 2.0945355191256832e-05, "loss": 0.4814, "num_tokens": 958916290.0, "step": 6335 }, { "epoch": 1.870758335792269, "grad_norm": 0.22197471641901045, "learning_rate": 2.0918032786885246e-05, "loss": 0.4604, "num_tokens": 959677928.0, "step": 6340 }, { "epoch": 1.8722336972558278, "grad_norm": 0.24880211393953147, "learning_rate": 2.0890710382513662e-05, "loss": 0.4888, "num_tokens": 960478834.0, "step": 6345 }, { "epoch": 1.873709058719386, "grad_norm": 0.23758873620224336, "learning_rate": 2.0863387978142076e-05, "loss": 0.4829, "num_tokens": 961246216.0, "step": 6350 }, { "epoch": 1.8751844201829448, "grad_norm": 0.2629522267624362, "learning_rate": 2.0836065573770496e-05, "loss": 0.4653, "num_tokens": 961957644.0, "step": 6355 }, { "epoch": 1.8766597816465034, "grad_norm": 0.24211968204617348, "learning_rate": 2.080874316939891e-05, "loss": 0.4793, "num_tokens": 962746432.0, "step": 6360 }, { "epoch": 1.878135143110062, "grad_norm": 0.24409625552226732, "learning_rate": 2.0781420765027322e-05, "loss": 0.469, "num_tokens": 963515808.0, "step": 6365 }, { "epoch": 1.8796105045736207, "grad_norm": 0.2242746102229429, "learning_rate": 2.075409836065574e-05, "loss": 0.4753, "num_tokens": 964296699.0, "step": 6370 }, { "epoch": 1.881085866037179, "grad_norm": 0.24882794882779694, "learning_rate": 2.0726775956284152e-05, "loss": 0.4908, "num_tokens": 965040252.0, "step": 6375 }, { "epoch": 1.8825612275007377, "grad_norm": 0.2471925714732687, "learning_rate": 2.069945355191257e-05, "loss": 0.4825, "num_tokens": 965802751.0, "step": 6380 }, { "epoch": 1.8840365889642963, "grad_norm": 0.24010720258343893, "learning_rate": 2.0672131147540986e-05, "loss": 0.4749, "num_tokens": 966584076.0, "step": 6385 }, { "epoch": 1.8855119504278548, "grad_norm": 0.2548283753159302, "learning_rate": 2.06448087431694e-05, "loss": 0.4712, "num_tokens": 967323037.0, "step": 6390 }, { "epoch": 1.8869873118914136, "grad_norm": 0.2714443329466118, "learning_rate": 2.0617486338797816e-05, "loss": 0.4588, "num_tokens": 968022924.0, "step": 6395 }, { "epoch": 1.8884626733549719, "grad_norm": 0.2339949667131456, "learning_rate": 2.059016393442623e-05, "loss": 0.4596, "num_tokens": 968733034.0, "step": 6400 }, { "epoch": 1.8899380348185306, "grad_norm": 0.24066185082079986, "learning_rate": 2.0562841530054646e-05, "loss": 0.4847, "num_tokens": 969485240.0, "step": 6405 }, { "epoch": 1.8914133962820892, "grad_norm": 0.2557021644279599, "learning_rate": 2.0535519125683063e-05, "loss": 0.4588, "num_tokens": 970211891.0, "step": 6410 }, { "epoch": 1.8928887577456477, "grad_norm": 0.2802313110486353, "learning_rate": 2.0508196721311476e-05, "loss": 0.4558, "num_tokens": 970900876.0, "step": 6415 }, { "epoch": 1.8943641192092062, "grad_norm": 0.24989925899599363, "learning_rate": 2.048087431693989e-05, "loss": 0.4745, "num_tokens": 971639624.0, "step": 6420 }, { "epoch": 1.8958394806727648, "grad_norm": 0.26454056927558084, "learning_rate": 2.0453551912568306e-05, "loss": 0.4663, "num_tokens": 972403895.0, "step": 6425 }, { "epoch": 1.8973148421363235, "grad_norm": 0.24260729934968076, "learning_rate": 2.0426229508196723e-05, "loss": 0.4665, "num_tokens": 973205457.0, "step": 6430 }, { "epoch": 1.8987902035998818, "grad_norm": 0.25381136131989473, "learning_rate": 2.039890710382514e-05, "loss": 0.474, "num_tokens": 973946097.0, "step": 6435 }, { "epoch": 1.9002655650634406, "grad_norm": 0.2391011420136122, "learning_rate": 2.0371584699453553e-05, "loss": 0.4867, "num_tokens": 974724271.0, "step": 6440 }, { "epoch": 1.901740926526999, "grad_norm": 0.23933815667266337, "learning_rate": 2.0344262295081966e-05, "loss": 0.4835, "num_tokens": 975478438.0, "step": 6445 }, { "epoch": 1.9032162879905576, "grad_norm": 0.24378907300002917, "learning_rate": 2.0316939890710386e-05, "loss": 0.4758, "num_tokens": 976216187.0, "step": 6450 }, { "epoch": 1.9046916494541164, "grad_norm": 0.23618750990843151, "learning_rate": 2.02896174863388e-05, "loss": 0.4807, "num_tokens": 977014316.0, "step": 6455 }, { "epoch": 1.9061670109176747, "grad_norm": 0.27353309500353107, "learning_rate": 2.0262295081967213e-05, "loss": 0.4525, "num_tokens": 977729560.0, "step": 6460 }, { "epoch": 1.9076423723812335, "grad_norm": 0.23481307930237175, "learning_rate": 2.023497267759563e-05, "loss": 0.4753, "num_tokens": 978521712.0, "step": 6465 }, { "epoch": 1.909117733844792, "grad_norm": 0.22298387336657643, "learning_rate": 2.0207650273224043e-05, "loss": 0.4898, "num_tokens": 979343575.0, "step": 6470 }, { "epoch": 1.9105930953083505, "grad_norm": 0.2280426612500138, "learning_rate": 2.018032786885246e-05, "loss": 0.4899, "num_tokens": 980136739.0, "step": 6475 }, { "epoch": 1.912068456771909, "grad_norm": 0.23475863792252966, "learning_rate": 2.0153005464480876e-05, "loss": 0.4682, "num_tokens": 980905296.0, "step": 6480 }, { "epoch": 1.9135438182354676, "grad_norm": 0.2574186690081149, "learning_rate": 2.012568306010929e-05, "loss": 0.4858, "num_tokens": 981668577.0, "step": 6485 }, { "epoch": 1.9150191796990264, "grad_norm": 0.23651213037311916, "learning_rate": 2.0098360655737706e-05, "loss": 0.4723, "num_tokens": 982489123.0, "step": 6490 }, { "epoch": 1.9164945411625849, "grad_norm": 0.2572625991415358, "learning_rate": 2.007103825136612e-05, "loss": 0.4819, "num_tokens": 983287251.0, "step": 6495 }, { "epoch": 1.9179699026261434, "grad_norm": 0.24351348232793096, "learning_rate": 2.0043715846994536e-05, "loss": 0.4907, "num_tokens": 984060434.0, "step": 6500 }, { "epoch": 1.919445264089702, "grad_norm": 0.23777272159129842, "learning_rate": 2.0016393442622953e-05, "loss": 0.4611, "num_tokens": 984770431.0, "step": 6505 }, { "epoch": 1.9209206255532605, "grad_norm": 0.24723936220169343, "learning_rate": 1.9989071038251366e-05, "loss": 0.4768, "num_tokens": 985510057.0, "step": 6510 }, { "epoch": 1.9223959870168192, "grad_norm": 0.2501910180867842, "learning_rate": 1.9961748633879783e-05, "loss": 0.4706, "num_tokens": 986191087.0, "step": 6515 }, { "epoch": 1.9238713484803776, "grad_norm": 0.24498252390636252, "learning_rate": 1.99344262295082e-05, "loss": 0.4858, "num_tokens": 986912884.0, "step": 6520 }, { "epoch": 1.9253467099439363, "grad_norm": 0.2403028537184044, "learning_rate": 1.9907103825136613e-05, "loss": 0.4848, "num_tokens": 987701031.0, "step": 6525 }, { "epoch": 1.9268220714074948, "grad_norm": 0.2856102844144801, "learning_rate": 1.987978142076503e-05, "loss": 0.4745, "num_tokens": 988471261.0, "step": 6530 }, { "epoch": 1.9282974328710534, "grad_norm": 0.2179913690447259, "learning_rate": 1.9852459016393443e-05, "loss": 0.4697, "num_tokens": 989217967.0, "step": 6535 }, { "epoch": 1.9297727943346121, "grad_norm": 0.2448551011364173, "learning_rate": 1.9825136612021856e-05, "loss": 0.4697, "num_tokens": 989900508.0, "step": 6540 }, { "epoch": 1.9312481557981704, "grad_norm": 0.24867082758111425, "learning_rate": 1.9797814207650276e-05, "loss": 0.4847, "num_tokens": 990633136.0, "step": 6545 }, { "epoch": 1.9327235172617292, "grad_norm": 0.24194718252450081, "learning_rate": 1.977049180327869e-05, "loss": 0.4868, "num_tokens": 991446675.0, "step": 6550 }, { "epoch": 1.9341988787252877, "grad_norm": 0.26670195504663124, "learning_rate": 1.9743169398907107e-05, "loss": 0.4659, "num_tokens": 992151810.0, "step": 6555 }, { "epoch": 1.9356742401888463, "grad_norm": 0.24595687436506616, "learning_rate": 1.971584699453552e-05, "loss": 0.4907, "num_tokens": 992904760.0, "step": 6560 }, { "epoch": 1.9371496016524048, "grad_norm": 0.2361891349278024, "learning_rate": 1.9688524590163933e-05, "loss": 0.4668, "num_tokens": 993670123.0, "step": 6565 }, { "epoch": 1.9386249631159633, "grad_norm": 0.23466698675969522, "learning_rate": 1.9661202185792353e-05, "loss": 0.4833, "num_tokens": 994428863.0, "step": 6570 }, { "epoch": 1.940100324579522, "grad_norm": 0.239676155056026, "learning_rate": 1.9633879781420767e-05, "loss": 0.4707, "num_tokens": 995156385.0, "step": 6575 }, { "epoch": 1.9415756860430804, "grad_norm": 0.2602668471125977, "learning_rate": 1.960655737704918e-05, "loss": 0.4832, "num_tokens": 995904720.0, "step": 6580 }, { "epoch": 1.9430510475066392, "grad_norm": 0.24376819916405731, "learning_rate": 1.9579234972677597e-05, "loss": 0.4665, "num_tokens": 996658039.0, "step": 6585 }, { "epoch": 1.9445264089701977, "grad_norm": 0.24398327907180561, "learning_rate": 1.955191256830601e-05, "loss": 0.4764, "num_tokens": 997440549.0, "step": 6590 }, { "epoch": 1.9460017704337562, "grad_norm": 0.25400802028333613, "learning_rate": 1.9524590163934427e-05, "loss": 0.4802, "num_tokens": 998224275.0, "step": 6595 }, { "epoch": 1.947477131897315, "grad_norm": 0.22989757967807284, "learning_rate": 1.9497267759562843e-05, "loss": 0.4694, "num_tokens": 999024434.0, "step": 6600 }, { "epoch": 1.9489524933608733, "grad_norm": 0.24946630207285417, "learning_rate": 1.9469945355191257e-05, "loss": 0.486, "num_tokens": 999800696.0, "step": 6605 }, { "epoch": 1.950427854824432, "grad_norm": 0.2686854012637484, "learning_rate": 1.9442622950819673e-05, "loss": 0.4682, "num_tokens": 1000576000.0, "step": 6610 }, { "epoch": 1.9519032162879906, "grad_norm": 0.23760448900986572, "learning_rate": 1.941530054644809e-05, "loss": 0.4741, "num_tokens": 1001352149.0, "step": 6615 }, { "epoch": 1.953378577751549, "grad_norm": 0.246515021399075, "learning_rate": 1.9387978142076503e-05, "loss": 0.4918, "num_tokens": 1002163158.0, "step": 6620 }, { "epoch": 1.9548539392151079, "grad_norm": 0.2513572995885258, "learning_rate": 1.936065573770492e-05, "loss": 0.4639, "num_tokens": 1002907406.0, "step": 6625 }, { "epoch": 1.9563293006786662, "grad_norm": 0.2480055114317128, "learning_rate": 1.9333333333333333e-05, "loss": 0.466, "num_tokens": 1003666810.0, "step": 6630 }, { "epoch": 1.957804662142225, "grad_norm": 0.27488902684208916, "learning_rate": 1.930601092896175e-05, "loss": 0.4826, "num_tokens": 1004444121.0, "step": 6635 }, { "epoch": 1.9592800236057835, "grad_norm": 0.2548202897987375, "learning_rate": 1.9278688524590167e-05, "loss": 0.4799, "num_tokens": 1005210206.0, "step": 6640 }, { "epoch": 1.960755385069342, "grad_norm": 0.26586832257698406, "learning_rate": 1.925136612021858e-05, "loss": 0.4666, "num_tokens": 1005977436.0, "step": 6645 }, { "epoch": 1.9622307465329005, "grad_norm": 0.2652956565301442, "learning_rate": 1.9224043715846997e-05, "loss": 0.4744, "num_tokens": 1006709791.0, "step": 6650 }, { "epoch": 1.963706107996459, "grad_norm": 0.24182345307460598, "learning_rate": 1.919672131147541e-05, "loss": 0.4718, "num_tokens": 1007420025.0, "step": 6655 }, { "epoch": 1.9651814694600178, "grad_norm": 0.22753603421170845, "learning_rate": 1.9169398907103824e-05, "loss": 0.4727, "num_tokens": 1008198484.0, "step": 6660 }, { "epoch": 1.9666568309235761, "grad_norm": 0.25059915653744913, "learning_rate": 1.9142076502732244e-05, "loss": 0.4616, "num_tokens": 1008989272.0, "step": 6665 }, { "epoch": 1.9681321923871349, "grad_norm": 0.26334343909949026, "learning_rate": 1.9114754098360657e-05, "loss": 0.4749, "num_tokens": 1009705522.0, "step": 6670 }, { "epoch": 1.9696075538506934, "grad_norm": 0.22347223639116412, "learning_rate": 1.908743169398907e-05, "loss": 0.4794, "num_tokens": 1010569704.0, "step": 6675 }, { "epoch": 1.971082915314252, "grad_norm": 0.21849402523604075, "learning_rate": 1.9060109289617487e-05, "loss": 0.4826, "num_tokens": 1011381904.0, "step": 6680 }, { "epoch": 1.9725582767778107, "grad_norm": 0.2441506276252297, "learning_rate": 1.90327868852459e-05, "loss": 0.4578, "num_tokens": 1012148404.0, "step": 6685 }, { "epoch": 1.974033638241369, "grad_norm": 0.23259409315956758, "learning_rate": 1.900546448087432e-05, "loss": 0.4704, "num_tokens": 1012884130.0, "step": 6690 }, { "epoch": 1.9755089997049278, "grad_norm": 0.26171705642164855, "learning_rate": 1.8978142076502734e-05, "loss": 0.4898, "num_tokens": 1013673396.0, "step": 6695 }, { "epoch": 1.9769843611684863, "grad_norm": 0.24692643635295877, "learning_rate": 1.8950819672131147e-05, "loss": 0.4859, "num_tokens": 1014492990.0, "step": 6700 }, { "epoch": 1.9784597226320448, "grad_norm": 0.24619975794407445, "learning_rate": 1.8923497267759564e-05, "loss": 0.4681, "num_tokens": 1015206792.0, "step": 6705 }, { "epoch": 1.9799350840956034, "grad_norm": 0.2550531215678855, "learning_rate": 1.889617486338798e-05, "loss": 0.4884, "num_tokens": 1015997950.0, "step": 6710 }, { "epoch": 1.981410445559162, "grad_norm": 0.24337359647517162, "learning_rate": 1.8868852459016394e-05, "loss": 0.4847, "num_tokens": 1016778567.0, "step": 6715 }, { "epoch": 1.9828858070227207, "grad_norm": 0.2343052651351182, "learning_rate": 1.884153005464481e-05, "loss": 0.455, "num_tokens": 1017452302.0, "step": 6720 }, { "epoch": 1.9843611684862792, "grad_norm": 0.2573809769312881, "learning_rate": 1.8814207650273224e-05, "loss": 0.4508, "num_tokens": 1018118590.0, "step": 6725 }, { "epoch": 1.9858365299498377, "grad_norm": 0.2473103799965538, "learning_rate": 1.878688524590164e-05, "loss": 0.4648, "num_tokens": 1018852895.0, "step": 6730 }, { "epoch": 1.9873118914133963, "grad_norm": 0.24481623761631435, "learning_rate": 1.8759562841530057e-05, "loss": 0.4755, "num_tokens": 1019635017.0, "step": 6735 }, { "epoch": 1.9887872528769548, "grad_norm": 0.24797667844706184, "learning_rate": 1.873224043715847e-05, "loss": 0.4741, "num_tokens": 1020381831.0, "step": 6740 }, { "epoch": 1.9902626143405135, "grad_norm": 0.26500538124231604, "learning_rate": 1.8704918032786887e-05, "loss": 0.4925, "num_tokens": 1021190394.0, "step": 6745 }, { "epoch": 1.9917379758040719, "grad_norm": 0.23031750424341596, "learning_rate": 1.86775956284153e-05, "loss": 0.4682, "num_tokens": 1021951407.0, "step": 6750 }, { "epoch": 1.9932133372676306, "grad_norm": 0.34509489467992177, "learning_rate": 1.8650273224043714e-05, "loss": 0.4824, "num_tokens": 1022725590.0, "step": 6755 }, { "epoch": 1.9946886987311891, "grad_norm": 0.24652552888909646, "learning_rate": 1.8622950819672134e-05, "loss": 0.4608, "num_tokens": 1023486243.0, "step": 6760 }, { "epoch": 1.9961640601947477, "grad_norm": 0.24721764872699709, "learning_rate": 1.8595628415300547e-05, "loss": 0.4838, "num_tokens": 1024230998.0, "step": 6765 }, { "epoch": 1.9976394216583064, "grad_norm": 0.2277186916175625, "learning_rate": 1.8568306010928964e-05, "loss": 0.4647, "num_tokens": 1024981902.0, "step": 6770 }, { "epoch": 1.9991147831218647, "grad_norm": 0.23991278669595162, "learning_rate": 1.8540983606557377e-05, "loss": 0.4947, "num_tokens": 1025791006.0, "step": 6775 }, { "epoch": 2.0005901445854235, "grad_norm": 0.28026917160148535, "learning_rate": 1.851366120218579e-05, "loss": 0.4356, "num_tokens": 1026478952.0, "step": 6780 }, { "epoch": 2.002065506048982, "grad_norm": 0.2532467849562269, "learning_rate": 1.848633879781421e-05, "loss": 0.3987, "num_tokens": 1027261840.0, "step": 6785 }, { "epoch": 2.0035408675125406, "grad_norm": 0.26494816372175584, "learning_rate": 1.8459016393442624e-05, "loss": 0.3942, "num_tokens": 1028055405.0, "step": 6790 }, { "epoch": 2.0050162289760993, "grad_norm": 0.27705675022808757, "learning_rate": 1.8431693989071037e-05, "loss": 0.4045, "num_tokens": 1028825767.0, "step": 6795 }, { "epoch": 2.0064915904396576, "grad_norm": 0.2877260272355745, "learning_rate": 1.8404371584699454e-05, "loss": 0.383, "num_tokens": 1029492945.0, "step": 6800 }, { "epoch": 2.0079669519032164, "grad_norm": 0.2550182567703657, "learning_rate": 1.837704918032787e-05, "loss": 0.3984, "num_tokens": 1030304620.0, "step": 6805 }, { "epoch": 2.0094423133667747, "grad_norm": 0.2855997098565556, "learning_rate": 1.8349726775956284e-05, "loss": 0.408, "num_tokens": 1031085739.0, "step": 6810 }, { "epoch": 2.0109176748303335, "grad_norm": 0.2692346244536945, "learning_rate": 1.83224043715847e-05, "loss": 0.3961, "num_tokens": 1031814651.0, "step": 6815 }, { "epoch": 2.012393036293892, "grad_norm": 0.29184407370714704, "learning_rate": 1.8295081967213114e-05, "loss": 0.398, "num_tokens": 1032502990.0, "step": 6820 }, { "epoch": 2.0138683977574505, "grad_norm": 0.27512732237466786, "learning_rate": 1.826775956284153e-05, "loss": 0.3978, "num_tokens": 1033232157.0, "step": 6825 }, { "epoch": 2.0153437592210093, "grad_norm": 0.26060320394227354, "learning_rate": 1.8240437158469948e-05, "loss": 0.402, "num_tokens": 1034042298.0, "step": 6830 }, { "epoch": 2.0168191206845676, "grad_norm": 0.2801319276820709, "learning_rate": 1.821311475409836e-05, "loss": 0.396, "num_tokens": 1034835127.0, "step": 6835 }, { "epoch": 2.0182944821481263, "grad_norm": 0.25547453848467855, "learning_rate": 1.8185792349726778e-05, "loss": 0.388, "num_tokens": 1035574871.0, "step": 6840 }, { "epoch": 2.0197698436116847, "grad_norm": 0.25659776212927293, "learning_rate": 1.815846994535519e-05, "loss": 0.4077, "num_tokens": 1036401620.0, "step": 6845 }, { "epoch": 2.0212452050752434, "grad_norm": 0.2644063768492194, "learning_rate": 1.8131147540983608e-05, "loss": 0.4029, "num_tokens": 1037200517.0, "step": 6850 }, { "epoch": 2.022720566538802, "grad_norm": 0.2715458314245296, "learning_rate": 1.8103825136612024e-05, "loss": 0.3969, "num_tokens": 1037984866.0, "step": 6855 }, { "epoch": 2.0241959280023605, "grad_norm": 0.27379747863541115, "learning_rate": 1.8076502732240438e-05, "loss": 0.4113, "num_tokens": 1038752991.0, "step": 6860 }, { "epoch": 2.0256712894659192, "grad_norm": 0.2834798506507569, "learning_rate": 1.8049180327868854e-05, "loss": 0.3992, "num_tokens": 1039541958.0, "step": 6865 }, { "epoch": 2.0271466509294775, "grad_norm": 0.27437280540598935, "learning_rate": 1.8021857923497268e-05, "loss": 0.3923, "num_tokens": 1040275743.0, "step": 6870 }, { "epoch": 2.0286220123930363, "grad_norm": 0.2591272038015355, "learning_rate": 1.799453551912568e-05, "loss": 0.3936, "num_tokens": 1041028655.0, "step": 6875 }, { "epoch": 2.030097373856595, "grad_norm": 0.2704031391704667, "learning_rate": 1.79672131147541e-05, "loss": 0.3874, "num_tokens": 1041778139.0, "step": 6880 }, { "epoch": 2.0315727353201534, "grad_norm": 0.2540553088362626, "learning_rate": 1.7939890710382515e-05, "loss": 0.3894, "num_tokens": 1042551965.0, "step": 6885 }, { "epoch": 2.033048096783712, "grad_norm": 0.27677572787912497, "learning_rate": 1.7912568306010928e-05, "loss": 0.4059, "num_tokens": 1043318628.0, "step": 6890 }, { "epoch": 2.0345234582472704, "grad_norm": 0.2599440799846538, "learning_rate": 1.7885245901639345e-05, "loss": 0.4095, "num_tokens": 1044094648.0, "step": 6895 }, { "epoch": 2.035998819710829, "grad_norm": 0.2557950985260886, "learning_rate": 1.785792349726776e-05, "loss": 0.4088, "num_tokens": 1044887477.0, "step": 6900 }, { "epoch": 2.037474181174388, "grad_norm": 0.2692112128031832, "learning_rate": 1.7830601092896178e-05, "loss": 0.4067, "num_tokens": 1045672509.0, "step": 6905 }, { "epoch": 2.0389495426379463, "grad_norm": 0.2896551698505846, "learning_rate": 1.780327868852459e-05, "loss": 0.3875, "num_tokens": 1046369951.0, "step": 6910 }, { "epoch": 2.040424904101505, "grad_norm": 0.27430896906966024, "learning_rate": 1.7775956284153005e-05, "loss": 0.3967, "num_tokens": 1047122967.0, "step": 6915 }, { "epoch": 2.0419002655650633, "grad_norm": 0.2608687855534625, "learning_rate": 1.774863387978142e-05, "loss": 0.3796, "num_tokens": 1047817485.0, "step": 6920 }, { "epoch": 2.043375627028622, "grad_norm": 0.26073523917355274, "learning_rate": 1.7721311475409838e-05, "loss": 0.4106, "num_tokens": 1048562762.0, "step": 6925 }, { "epoch": 2.0448509884921804, "grad_norm": 0.24791707127967458, "learning_rate": 1.769398907103825e-05, "loss": 0.4016, "num_tokens": 1049347964.0, "step": 6930 }, { "epoch": 2.046326349955739, "grad_norm": 0.26510466663869703, "learning_rate": 1.7666666666666668e-05, "loss": 0.4226, "num_tokens": 1050207637.0, "step": 6935 }, { "epoch": 2.047801711419298, "grad_norm": 0.26296551610208424, "learning_rate": 1.763934426229508e-05, "loss": 0.3984, "num_tokens": 1050966956.0, "step": 6940 }, { "epoch": 2.049277072882856, "grad_norm": 0.26454970559585933, "learning_rate": 1.7612021857923498e-05, "loss": 0.3988, "num_tokens": 1051691970.0, "step": 6945 }, { "epoch": 2.050752434346415, "grad_norm": 0.31589446246506214, "learning_rate": 1.7584699453551915e-05, "loss": 0.3848, "num_tokens": 1052448433.0, "step": 6950 }, { "epoch": 2.0522277958099733, "grad_norm": 0.29867277757317884, "learning_rate": 1.7557377049180328e-05, "loss": 0.4001, "num_tokens": 1053187718.0, "step": 6955 }, { "epoch": 2.053703157273532, "grad_norm": 0.28178646514596695, "learning_rate": 1.7530054644808745e-05, "loss": 0.3927, "num_tokens": 1053926988.0, "step": 6960 }, { "epoch": 2.055178518737091, "grad_norm": 0.2641508871704113, "learning_rate": 1.7502732240437158e-05, "loss": 0.4028, "num_tokens": 1054709878.0, "step": 6965 }, { "epoch": 2.056653880200649, "grad_norm": 0.28234141432690885, "learning_rate": 1.7475409836065575e-05, "loss": 0.4138, "num_tokens": 1055526415.0, "step": 6970 }, { "epoch": 2.058129241664208, "grad_norm": 0.2738570895065596, "learning_rate": 1.744808743169399e-05, "loss": 0.3941, "num_tokens": 1056251867.0, "step": 6975 }, { "epoch": 2.059604603127766, "grad_norm": 0.2650019355057638, "learning_rate": 1.7420765027322405e-05, "loss": 0.3884, "num_tokens": 1056996036.0, "step": 6980 }, { "epoch": 2.061079964591325, "grad_norm": 0.2708100866157162, "learning_rate": 1.739344262295082e-05, "loss": 0.4083, "num_tokens": 1057749690.0, "step": 6985 }, { "epoch": 2.0625553260548832, "grad_norm": 0.25692870101854764, "learning_rate": 1.7366120218579235e-05, "loss": 0.3903, "num_tokens": 1058522531.0, "step": 6990 }, { "epoch": 2.064030687518442, "grad_norm": 0.2512646337452096, "learning_rate": 1.733879781420765e-05, "loss": 0.4022, "num_tokens": 1059350753.0, "step": 6995 }, { "epoch": 2.0655060489820007, "grad_norm": 0.27262906343131593, "learning_rate": 1.731147540983607e-05, "loss": 0.3944, "num_tokens": 1060127651.0, "step": 7000 }, { "epoch": 2.066981410445559, "grad_norm": 0.28289869770185727, "learning_rate": 1.728415300546448e-05, "loss": 0.3941, "num_tokens": 1060877683.0, "step": 7005 }, { "epoch": 2.068456771909118, "grad_norm": 0.26388258993950614, "learning_rate": 1.7256830601092895e-05, "loss": 0.3887, "num_tokens": 1061607448.0, "step": 7010 }, { "epoch": 2.069932133372676, "grad_norm": 0.2983622029869106, "learning_rate": 1.7229508196721312e-05, "loss": 0.3905, "num_tokens": 1062313231.0, "step": 7015 }, { "epoch": 2.071407494836235, "grad_norm": 0.27307707788264135, "learning_rate": 1.720218579234973e-05, "loss": 0.3956, "num_tokens": 1063123308.0, "step": 7020 }, { "epoch": 2.0728828562997936, "grad_norm": 0.25698560771674256, "learning_rate": 1.7174863387978145e-05, "loss": 0.4172, "num_tokens": 1063958019.0, "step": 7025 }, { "epoch": 2.074358217763352, "grad_norm": 0.26895670944821065, "learning_rate": 1.714754098360656e-05, "loss": 0.391, "num_tokens": 1064703271.0, "step": 7030 }, { "epoch": 2.0758335792269107, "grad_norm": 0.24219443161821547, "learning_rate": 1.7120218579234972e-05, "loss": 0.4084, "num_tokens": 1065507752.0, "step": 7035 }, { "epoch": 2.077308940690469, "grad_norm": 0.27826517907025644, "learning_rate": 1.709289617486339e-05, "loss": 0.3944, "num_tokens": 1066201381.0, "step": 7040 }, { "epoch": 2.0787843021540278, "grad_norm": 0.2630026210341555, "learning_rate": 1.7065573770491805e-05, "loss": 0.4056, "num_tokens": 1066991245.0, "step": 7045 }, { "epoch": 2.0802596636175865, "grad_norm": 0.26127481188596374, "learning_rate": 1.703825136612022e-05, "loss": 0.4259, "num_tokens": 1067813069.0, "step": 7050 }, { "epoch": 2.081735025081145, "grad_norm": 0.2773717958701698, "learning_rate": 1.7010928961748635e-05, "loss": 0.403, "num_tokens": 1068524555.0, "step": 7055 }, { "epoch": 2.0832103865447036, "grad_norm": 0.2695189971358803, "learning_rate": 1.698360655737705e-05, "loss": 0.4114, "num_tokens": 1069355778.0, "step": 7060 }, { "epoch": 2.084685748008262, "grad_norm": 0.27193650134149083, "learning_rate": 1.6956284153005465e-05, "loss": 0.3974, "num_tokens": 1070079448.0, "step": 7065 }, { "epoch": 2.0861611094718207, "grad_norm": 0.28979574770517025, "learning_rate": 1.6928961748633882e-05, "loss": 0.398, "num_tokens": 1070825515.0, "step": 7070 }, { "epoch": 2.087636470935379, "grad_norm": 0.24646672623943996, "learning_rate": 1.6901639344262295e-05, "loss": 0.4054, "num_tokens": 1071670494.0, "step": 7075 }, { "epoch": 2.0891118323989377, "grad_norm": 0.2747287177491863, "learning_rate": 1.6874316939890712e-05, "loss": 0.4076, "num_tokens": 1072451303.0, "step": 7080 }, { "epoch": 2.0905871938624965, "grad_norm": 0.2626502667163471, "learning_rate": 1.6846994535519125e-05, "loss": 0.4051, "num_tokens": 1073247513.0, "step": 7085 }, { "epoch": 2.092062555326055, "grad_norm": 0.29882907730593367, "learning_rate": 1.6819672131147542e-05, "loss": 0.4036, "num_tokens": 1074048612.0, "step": 7090 }, { "epoch": 2.0935379167896135, "grad_norm": 0.2737139619011727, "learning_rate": 1.679234972677596e-05, "loss": 0.3964, "num_tokens": 1074784482.0, "step": 7095 }, { "epoch": 2.095013278253172, "grad_norm": 0.269825119008097, "learning_rate": 1.6765027322404372e-05, "loss": 0.4143, "num_tokens": 1075562615.0, "step": 7100 }, { "epoch": 2.0964886397167306, "grad_norm": 0.28526906585427786, "learning_rate": 1.673770491803279e-05, "loss": 0.3699, "num_tokens": 1076286908.0, "step": 7105 }, { "epoch": 2.0979640011802894, "grad_norm": 0.2698975151815133, "learning_rate": 1.6710382513661202e-05, "loss": 0.3975, "num_tokens": 1077078628.0, "step": 7110 }, { "epoch": 2.0994393626438477, "grad_norm": 0.29664593373079884, "learning_rate": 1.668306010928962e-05, "loss": 0.4013, "num_tokens": 1077853666.0, "step": 7115 }, { "epoch": 2.1009147241074064, "grad_norm": 0.28014455040835545, "learning_rate": 1.6655737704918036e-05, "loss": 0.378, "num_tokens": 1078536196.0, "step": 7120 }, { "epoch": 2.1023900855709647, "grad_norm": 0.2743218054198271, "learning_rate": 1.662841530054645e-05, "loss": 0.3807, "num_tokens": 1079245913.0, "step": 7125 }, { "epoch": 2.1038654470345235, "grad_norm": 0.2754802405047418, "learning_rate": 1.6601092896174862e-05, "loss": 0.412, "num_tokens": 1079998403.0, "step": 7130 }, { "epoch": 2.105340808498082, "grad_norm": 0.3152576617539754, "learning_rate": 1.657377049180328e-05, "loss": 0.3818, "num_tokens": 1080692337.0, "step": 7135 }, { "epoch": 2.1068161699616406, "grad_norm": 0.28589951491102006, "learning_rate": 1.6546448087431696e-05, "loss": 0.4077, "num_tokens": 1081415275.0, "step": 7140 }, { "epoch": 2.1082915314251993, "grad_norm": 0.28248083964770626, "learning_rate": 1.651912568306011e-05, "loss": 0.3925, "num_tokens": 1082198138.0, "step": 7145 }, { "epoch": 2.1097668928887576, "grad_norm": 0.2591548111845972, "learning_rate": 1.6491803278688526e-05, "loss": 0.394, "num_tokens": 1082968968.0, "step": 7150 }, { "epoch": 2.1112422543523164, "grad_norm": 0.29571065274145547, "learning_rate": 1.646448087431694e-05, "loss": 0.4024, "num_tokens": 1083708595.0, "step": 7155 }, { "epoch": 2.1127176158158747, "grad_norm": 0.29069026869945636, "learning_rate": 1.643715846994536e-05, "loss": 0.3825, "num_tokens": 1084415755.0, "step": 7160 }, { "epoch": 2.1141929772794334, "grad_norm": 0.24458994612829715, "learning_rate": 1.6409836065573772e-05, "loss": 0.3928, "num_tokens": 1085190156.0, "step": 7165 }, { "epoch": 2.115668338742992, "grad_norm": 0.26802082241451813, "learning_rate": 1.6382513661202186e-05, "loss": 0.394, "num_tokens": 1085983168.0, "step": 7170 }, { "epoch": 2.1171437002065505, "grad_norm": 0.2577095117071163, "learning_rate": 1.6355191256830602e-05, "loss": 0.4052, "num_tokens": 1086837207.0, "step": 7175 }, { "epoch": 2.1186190616701093, "grad_norm": 0.28939611949468036, "learning_rate": 1.6327868852459016e-05, "loss": 0.3925, "num_tokens": 1087583142.0, "step": 7180 }, { "epoch": 2.1200944231336676, "grad_norm": 0.2724476771178842, "learning_rate": 1.6300546448087432e-05, "loss": 0.3862, "num_tokens": 1088332287.0, "step": 7185 }, { "epoch": 2.1215697845972263, "grad_norm": 0.2609384266649422, "learning_rate": 1.627322404371585e-05, "loss": 0.4209, "num_tokens": 1089121108.0, "step": 7190 }, { "epoch": 2.123045146060785, "grad_norm": 0.2812726297262784, "learning_rate": 1.6245901639344262e-05, "loss": 0.4096, "num_tokens": 1089876046.0, "step": 7195 }, { "epoch": 2.1245205075243434, "grad_norm": 0.25675561833274424, "learning_rate": 1.621857923497268e-05, "loss": 0.404, "num_tokens": 1090628906.0, "step": 7200 }, { "epoch": 2.125995868987902, "grad_norm": 0.28517479256327255, "learning_rate": 1.6191256830601092e-05, "loss": 0.406, "num_tokens": 1091368976.0, "step": 7205 }, { "epoch": 2.1274712304514605, "grad_norm": 0.2763645246160652, "learning_rate": 1.616393442622951e-05, "loss": 0.4037, "num_tokens": 1092143671.0, "step": 7210 }, { "epoch": 2.1289465919150192, "grad_norm": 0.2536453246098775, "learning_rate": 1.6136612021857926e-05, "loss": 0.4119, "num_tokens": 1092897416.0, "step": 7215 }, { "epoch": 2.130421953378578, "grad_norm": 0.28371825281959984, "learning_rate": 1.610928961748634e-05, "loss": 0.4169, "num_tokens": 1093653634.0, "step": 7220 }, { "epoch": 2.1318973148421363, "grad_norm": 0.2877364614857359, "learning_rate": 1.6081967213114753e-05, "loss": 0.4112, "num_tokens": 1094414620.0, "step": 7225 }, { "epoch": 2.133372676305695, "grad_norm": 0.3535047560039283, "learning_rate": 1.605464480874317e-05, "loss": 0.3914, "num_tokens": 1095124938.0, "step": 7230 }, { "epoch": 2.1348480377692534, "grad_norm": 0.2743827509966635, "learning_rate": 1.6027322404371586e-05, "loss": 0.4057, "num_tokens": 1095918854.0, "step": 7235 }, { "epoch": 2.136323399232812, "grad_norm": 0.2527260029176411, "learning_rate": 1.6000000000000003e-05, "loss": 0.3769, "num_tokens": 1096625685.0, "step": 7240 }, { "epoch": 2.1377987606963704, "grad_norm": 0.2830146158676854, "learning_rate": 1.5972677595628416e-05, "loss": 0.4064, "num_tokens": 1097424942.0, "step": 7245 }, { "epoch": 2.139274122159929, "grad_norm": 0.25113823136547114, "learning_rate": 1.594535519125683e-05, "loss": 0.3819, "num_tokens": 1098217857.0, "step": 7250 }, { "epoch": 2.140749483623488, "grad_norm": 0.2835920117869575, "learning_rate": 1.591803278688525e-05, "loss": 0.4009, "num_tokens": 1098991383.0, "step": 7255 }, { "epoch": 2.1422248450870462, "grad_norm": 0.2645881875145541, "learning_rate": 1.5890710382513663e-05, "loss": 0.4011, "num_tokens": 1099733477.0, "step": 7260 }, { "epoch": 2.143700206550605, "grad_norm": 0.3008886611229066, "learning_rate": 1.5863387978142076e-05, "loss": 0.3907, "num_tokens": 1100414446.0, "step": 7265 }, { "epoch": 2.1451755680141633, "grad_norm": 0.29266905955210376, "learning_rate": 1.5836065573770493e-05, "loss": 0.4035, "num_tokens": 1101170772.0, "step": 7270 }, { "epoch": 2.146650929477722, "grad_norm": 0.27366586217252237, "learning_rate": 1.5808743169398906e-05, "loss": 0.3948, "num_tokens": 1101877657.0, "step": 7275 }, { "epoch": 2.1481262909412804, "grad_norm": 0.28478618969058794, "learning_rate": 1.5781420765027323e-05, "loss": 0.3803, "num_tokens": 1102584778.0, "step": 7280 }, { "epoch": 2.149601652404839, "grad_norm": 0.28109153539100895, "learning_rate": 1.575409836065574e-05, "loss": 0.3879, "num_tokens": 1103346356.0, "step": 7285 }, { "epoch": 2.151077013868398, "grad_norm": 0.26758579347321154, "learning_rate": 1.5726775956284153e-05, "loss": 0.3965, "num_tokens": 1104104141.0, "step": 7290 }, { "epoch": 2.152552375331956, "grad_norm": 0.3278656404983091, "learning_rate": 1.569945355191257e-05, "loss": 0.4045, "num_tokens": 1104826717.0, "step": 7295 }, { "epoch": 2.154027736795515, "grad_norm": 0.2549287204391056, "learning_rate": 1.5672131147540983e-05, "loss": 0.3946, "num_tokens": 1105595108.0, "step": 7300 }, { "epoch": 2.1555030982590733, "grad_norm": 0.2461285564963294, "learning_rate": 1.56448087431694e-05, "loss": 0.3853, "num_tokens": 1106368194.0, "step": 7305 }, { "epoch": 2.156978459722632, "grad_norm": 0.29761129077595166, "learning_rate": 1.5617486338797816e-05, "loss": 0.3823, "num_tokens": 1107093156.0, "step": 7310 }, { "epoch": 2.158453821186191, "grad_norm": 0.2677826548420208, "learning_rate": 1.559016393442623e-05, "loss": 0.3925, "num_tokens": 1107850365.0, "step": 7315 }, { "epoch": 2.159929182649749, "grad_norm": 0.26968893185194526, "learning_rate": 1.5562841530054646e-05, "loss": 0.4119, "num_tokens": 1108650597.0, "step": 7320 }, { "epoch": 2.161404544113308, "grad_norm": 0.29005084512331264, "learning_rate": 1.553551912568306e-05, "loss": 0.3957, "num_tokens": 1109366614.0, "step": 7325 }, { "epoch": 2.162879905576866, "grad_norm": 0.28557972592392306, "learning_rate": 1.5508196721311476e-05, "loss": 0.3887, "num_tokens": 1110146625.0, "step": 7330 }, { "epoch": 2.164355267040425, "grad_norm": 0.2712969116414119, "learning_rate": 1.5480874316939893e-05, "loss": 0.3984, "num_tokens": 1110943621.0, "step": 7335 }, { "epoch": 2.1658306285039837, "grad_norm": 0.271820762034211, "learning_rate": 1.5453551912568306e-05, "loss": 0.3892, "num_tokens": 1111667205.0, "step": 7340 }, { "epoch": 2.167305989967542, "grad_norm": 0.2902279805582313, "learning_rate": 1.542622950819672e-05, "loss": 0.4242, "num_tokens": 1112467494.0, "step": 7345 }, { "epoch": 2.1687813514311007, "grad_norm": 0.2747882070422075, "learning_rate": 1.539890710382514e-05, "loss": 0.4112, "num_tokens": 1113291086.0, "step": 7350 }, { "epoch": 2.170256712894659, "grad_norm": 0.2756939437218494, "learning_rate": 1.5371584699453553e-05, "loss": 0.3863, "num_tokens": 1114057535.0, "step": 7355 }, { "epoch": 2.171732074358218, "grad_norm": 0.3406508442753039, "learning_rate": 1.5344262295081966e-05, "loss": 0.3786, "num_tokens": 1114744816.0, "step": 7360 }, { "epoch": 2.1732074358217766, "grad_norm": 0.2965377091751263, "learning_rate": 1.5316939890710383e-05, "loss": 0.3912, "num_tokens": 1115503148.0, "step": 7365 }, { "epoch": 2.174682797285335, "grad_norm": 0.26269126380893537, "learning_rate": 1.5289617486338796e-05, "loss": 0.3905, "num_tokens": 1116276183.0, "step": 7370 }, { "epoch": 2.1761581587488936, "grad_norm": 0.2785162445760474, "learning_rate": 1.5262295081967217e-05, "loss": 0.3864, "num_tokens": 1117006447.0, "step": 7375 }, { "epoch": 2.177633520212452, "grad_norm": 0.2785355538204506, "learning_rate": 1.523497267759563e-05, "loss": 0.384, "num_tokens": 1117724790.0, "step": 7380 }, { "epoch": 2.1791088816760107, "grad_norm": 0.2508122034906474, "learning_rate": 1.5207650273224045e-05, "loss": 0.4111, "num_tokens": 1118568951.0, "step": 7385 }, { "epoch": 2.180584243139569, "grad_norm": 0.2646224367351018, "learning_rate": 1.518032786885246e-05, "loss": 0.3937, "num_tokens": 1119320643.0, "step": 7390 }, { "epoch": 2.1820596046031278, "grad_norm": 0.3347709759508545, "learning_rate": 1.5153005464480873e-05, "loss": 0.3844, "num_tokens": 1119961606.0, "step": 7395 }, { "epoch": 2.1835349660666865, "grad_norm": 0.26820211278925343, "learning_rate": 1.5125683060109292e-05, "loss": 0.3813, "num_tokens": 1120690290.0, "step": 7400 }, { "epoch": 2.185010327530245, "grad_norm": 0.2691882065508194, "learning_rate": 1.5098360655737707e-05, "loss": 0.3987, "num_tokens": 1121421698.0, "step": 7405 }, { "epoch": 2.1864856889938036, "grad_norm": 0.2633205719874947, "learning_rate": 1.507103825136612e-05, "loss": 0.3875, "num_tokens": 1122166668.0, "step": 7410 }, { "epoch": 2.187961050457362, "grad_norm": 0.2665022763400291, "learning_rate": 1.5043715846994535e-05, "loss": 0.4105, "num_tokens": 1122943207.0, "step": 7415 }, { "epoch": 2.1894364119209206, "grad_norm": 0.27496889483198794, "learning_rate": 1.501639344262295e-05, "loss": 0.3829, "num_tokens": 1123685004.0, "step": 7420 }, { "epoch": 2.1909117733844794, "grad_norm": 0.28384236932364504, "learning_rate": 1.4989071038251368e-05, "loss": 0.4009, "num_tokens": 1124440946.0, "step": 7425 }, { "epoch": 2.1923871348480377, "grad_norm": 0.2721879473572946, "learning_rate": 1.4961748633879782e-05, "loss": 0.4135, "num_tokens": 1125205425.0, "step": 7430 }, { "epoch": 2.1938624963115965, "grad_norm": 0.29813933613542387, "learning_rate": 1.4934426229508197e-05, "loss": 0.3985, "num_tokens": 1125896978.0, "step": 7435 }, { "epoch": 2.1953378577751548, "grad_norm": 0.2574735171196109, "learning_rate": 1.4907103825136612e-05, "loss": 0.4054, "num_tokens": 1126676655.0, "step": 7440 }, { "epoch": 2.1968132192387135, "grad_norm": 0.29373891851444306, "learning_rate": 1.487978142076503e-05, "loss": 0.4068, "num_tokens": 1127451857.0, "step": 7445 }, { "epoch": 2.198288580702272, "grad_norm": 0.28809207794526304, "learning_rate": 1.4852459016393443e-05, "loss": 0.3859, "num_tokens": 1128156416.0, "step": 7450 }, { "epoch": 2.1997639421658306, "grad_norm": 0.26593365920758183, "learning_rate": 1.4825136612021859e-05, "loss": 0.3889, "num_tokens": 1128894659.0, "step": 7455 }, { "epoch": 2.2012393036293894, "grad_norm": 0.2679145498661543, "learning_rate": 1.4797814207650274e-05, "loss": 0.4264, "num_tokens": 1129735093.0, "step": 7460 }, { "epoch": 2.2027146650929477, "grad_norm": 0.2697287771598474, "learning_rate": 1.4770491803278689e-05, "loss": 0.3819, "num_tokens": 1130461740.0, "step": 7465 }, { "epoch": 2.2041900265565064, "grad_norm": 0.29369491622307115, "learning_rate": 1.4743169398907105e-05, "loss": 0.3954, "num_tokens": 1131238509.0, "step": 7470 }, { "epoch": 2.2056653880200647, "grad_norm": 0.2596991316632138, "learning_rate": 1.471584699453552e-05, "loss": 0.4074, "num_tokens": 1132038998.0, "step": 7475 }, { "epoch": 2.2071407494836235, "grad_norm": 0.3137453787793811, "learning_rate": 1.4688524590163935e-05, "loss": 0.4011, "num_tokens": 1132764761.0, "step": 7480 }, { "epoch": 2.2086161109471822, "grad_norm": 0.2920848272463345, "learning_rate": 1.466120218579235e-05, "loss": 0.4006, "num_tokens": 1133501597.0, "step": 7485 }, { "epoch": 2.2100914724107406, "grad_norm": 0.3242045114726045, "learning_rate": 1.4633879781420765e-05, "loss": 0.3938, "num_tokens": 1134248454.0, "step": 7490 }, { "epoch": 2.2115668338742993, "grad_norm": 0.2736015535540063, "learning_rate": 1.4606557377049182e-05, "loss": 0.4172, "num_tokens": 1135082046.0, "step": 7495 }, { "epoch": 2.2130421953378576, "grad_norm": 0.27750491090067453, "learning_rate": 1.4579234972677597e-05, "loss": 0.3986, "num_tokens": 1135868732.0, "step": 7500 }, { "epoch": 2.2145175568014164, "grad_norm": 0.26198859159593724, "learning_rate": 1.4551912568306012e-05, "loss": 0.4171, "num_tokens": 1136653613.0, "step": 7505 }, { "epoch": 2.215992918264975, "grad_norm": 0.28404869469620064, "learning_rate": 1.4524590163934425e-05, "loss": 0.3942, "num_tokens": 1137415125.0, "step": 7510 }, { "epoch": 2.2174682797285334, "grad_norm": 0.30014861936325865, "learning_rate": 1.449726775956284e-05, "loss": 0.3841, "num_tokens": 1138103505.0, "step": 7515 }, { "epoch": 2.218943641192092, "grad_norm": 0.31025948176682966, "learning_rate": 1.4469945355191259e-05, "loss": 0.4076, "num_tokens": 1138858424.0, "step": 7520 }, { "epoch": 2.2204190026556505, "grad_norm": 0.2705334593741179, "learning_rate": 1.4442622950819674e-05, "loss": 0.4042, "num_tokens": 1139627579.0, "step": 7525 }, { "epoch": 2.2218943641192093, "grad_norm": 0.2815768738281566, "learning_rate": 1.4415300546448087e-05, "loss": 0.399, "num_tokens": 1140364493.0, "step": 7530 }, { "epoch": 2.2233697255827676, "grad_norm": 0.26403311477880337, "learning_rate": 1.4387978142076502e-05, "loss": 0.3842, "num_tokens": 1141108344.0, "step": 7535 }, { "epoch": 2.2248450870463263, "grad_norm": 0.26050854687829744, "learning_rate": 1.436065573770492e-05, "loss": 0.3806, "num_tokens": 1141814053.0, "step": 7540 }, { "epoch": 2.226320448509885, "grad_norm": 0.28081784575976654, "learning_rate": 1.4333333333333334e-05, "loss": 0.3891, "num_tokens": 1142592245.0, "step": 7545 }, { "epoch": 2.2277958099734434, "grad_norm": 0.2726817558148972, "learning_rate": 1.4306010928961749e-05, "loss": 0.397, "num_tokens": 1143344135.0, "step": 7550 }, { "epoch": 2.229271171437002, "grad_norm": 0.2819873582953687, "learning_rate": 1.4278688524590164e-05, "loss": 0.4022, "num_tokens": 1144110330.0, "step": 7555 }, { "epoch": 2.2307465329005605, "grad_norm": 0.27424245995734275, "learning_rate": 1.4251366120218579e-05, "loss": 0.3969, "num_tokens": 1144850809.0, "step": 7560 }, { "epoch": 2.232221894364119, "grad_norm": 0.30785665114544203, "learning_rate": 1.4224043715846996e-05, "loss": 0.3794, "num_tokens": 1145551897.0, "step": 7565 }, { "epoch": 2.233697255827678, "grad_norm": 0.28320276265520206, "learning_rate": 1.419672131147541e-05, "loss": 0.3867, "num_tokens": 1146252661.0, "step": 7570 }, { "epoch": 2.2351726172912363, "grad_norm": 0.3049427766211468, "learning_rate": 1.4169398907103826e-05, "loss": 0.3729, "num_tokens": 1146978469.0, "step": 7575 }, { "epoch": 2.236647978754795, "grad_norm": 0.2735546892283133, "learning_rate": 1.414207650273224e-05, "loss": 0.3919, "num_tokens": 1147711618.0, "step": 7580 }, { "epoch": 2.2381233402183534, "grad_norm": 0.27376984092510237, "learning_rate": 1.4114754098360656e-05, "loss": 0.4043, "num_tokens": 1148495237.0, "step": 7585 }, { "epoch": 2.239598701681912, "grad_norm": 0.28017922577522736, "learning_rate": 1.4087431693989072e-05, "loss": 0.3955, "num_tokens": 1149222666.0, "step": 7590 }, { "epoch": 2.2410740631454704, "grad_norm": 0.3159070061569412, "learning_rate": 1.4060109289617487e-05, "loss": 0.3795, "num_tokens": 1149987173.0, "step": 7595 }, { "epoch": 2.242549424609029, "grad_norm": 0.2793518976099039, "learning_rate": 1.4032786885245902e-05, "loss": 0.4079, "num_tokens": 1150770092.0, "step": 7600 }, { "epoch": 2.244024786072588, "grad_norm": 0.2882493085520258, "learning_rate": 1.4005464480874317e-05, "loss": 0.3913, "num_tokens": 1151515507.0, "step": 7605 }, { "epoch": 2.2455001475361462, "grad_norm": 0.3061903318221156, "learning_rate": 1.397814207650273e-05, "loss": 0.3985, "num_tokens": 1152273992.0, "step": 7610 }, { "epoch": 2.246975508999705, "grad_norm": 0.2584622677698678, "learning_rate": 1.395081967213115e-05, "loss": 0.4009, "num_tokens": 1153033639.0, "step": 7615 }, { "epoch": 2.2484508704632633, "grad_norm": 0.27468261440655256, "learning_rate": 1.3923497267759564e-05, "loss": 0.4003, "num_tokens": 1153752832.0, "step": 7620 }, { "epoch": 2.249926231926822, "grad_norm": 0.2517068744363656, "learning_rate": 1.389617486338798e-05, "loss": 0.4096, "num_tokens": 1154584309.0, "step": 7625 }, { "epoch": 2.251401593390381, "grad_norm": 0.2889678465090989, "learning_rate": 1.3868852459016393e-05, "loss": 0.4036, "num_tokens": 1155325187.0, "step": 7630 }, { "epoch": 2.252876954853939, "grad_norm": 0.2910627536297895, "learning_rate": 1.3841530054644811e-05, "loss": 0.392, "num_tokens": 1156051244.0, "step": 7635 }, { "epoch": 2.254352316317498, "grad_norm": 0.2980101378781006, "learning_rate": 1.3814207650273226e-05, "loss": 0.3894, "num_tokens": 1156736038.0, "step": 7640 }, { "epoch": 2.255827677781056, "grad_norm": 0.30505216008618474, "learning_rate": 1.378688524590164e-05, "loss": 0.3934, "num_tokens": 1157467902.0, "step": 7645 }, { "epoch": 2.257303039244615, "grad_norm": 0.2813992291024792, "learning_rate": 1.3759562841530054e-05, "loss": 0.3902, "num_tokens": 1158184261.0, "step": 7650 }, { "epoch": 2.2587784007081737, "grad_norm": 0.2707765232733498, "learning_rate": 1.373224043715847e-05, "loss": 0.3854, "num_tokens": 1158941579.0, "step": 7655 }, { "epoch": 2.260253762171732, "grad_norm": 0.2651774110481582, "learning_rate": 1.3704918032786888e-05, "loss": 0.4111, "num_tokens": 1159756323.0, "step": 7660 }, { "epoch": 2.2617291236352908, "grad_norm": 0.30028315124430716, "learning_rate": 1.3677595628415301e-05, "loss": 0.3935, "num_tokens": 1160462557.0, "step": 7665 }, { "epoch": 2.263204485098849, "grad_norm": 0.28584939242266133, "learning_rate": 1.3650273224043716e-05, "loss": 0.3717, "num_tokens": 1161165431.0, "step": 7670 }, { "epoch": 2.264679846562408, "grad_norm": 0.2807954712981992, "learning_rate": 1.3622950819672131e-05, "loss": 0.4048, "num_tokens": 1161884931.0, "step": 7675 }, { "epoch": 2.2661552080259666, "grad_norm": 0.2618287164754355, "learning_rate": 1.3595628415300546e-05, "loss": 0.3985, "num_tokens": 1162682112.0, "step": 7680 }, { "epoch": 2.267630569489525, "grad_norm": 0.28431930517689785, "learning_rate": 1.3568306010928963e-05, "loss": 0.3865, "num_tokens": 1163419590.0, "step": 7685 }, { "epoch": 2.2691059309530837, "grad_norm": 0.25244095456509136, "learning_rate": 1.3540983606557378e-05, "loss": 0.3816, "num_tokens": 1164181310.0, "step": 7690 }, { "epoch": 2.270581292416642, "grad_norm": 0.29134379664599475, "learning_rate": 1.3513661202185793e-05, "loss": 0.4123, "num_tokens": 1164934712.0, "step": 7695 }, { "epoch": 2.2720566538802007, "grad_norm": 0.2818218786105173, "learning_rate": 1.3486338797814208e-05, "loss": 0.3999, "num_tokens": 1165701101.0, "step": 7700 }, { "epoch": 2.273532015343759, "grad_norm": 0.2607788554442979, "learning_rate": 1.3459016393442623e-05, "loss": 0.3858, "num_tokens": 1166427733.0, "step": 7705 }, { "epoch": 2.275007376807318, "grad_norm": 0.28302570817529016, "learning_rate": 1.343169398907104e-05, "loss": 0.428, "num_tokens": 1167245803.0, "step": 7710 }, { "epoch": 2.2764827382708765, "grad_norm": 0.2772630224882978, "learning_rate": 1.3404371584699455e-05, "loss": 0.4092, "num_tokens": 1168038975.0, "step": 7715 }, { "epoch": 2.277958099734435, "grad_norm": 0.28242240141256314, "learning_rate": 1.337704918032787e-05, "loss": 0.3822, "num_tokens": 1168721656.0, "step": 7720 }, { "epoch": 2.2794334611979936, "grad_norm": 0.2667888432336588, "learning_rate": 1.3349726775956285e-05, "loss": 0.4083, "num_tokens": 1169532183.0, "step": 7725 }, { "epoch": 2.280908822661552, "grad_norm": 0.2889479633934226, "learning_rate": 1.3322404371584701e-05, "loss": 0.3818, "num_tokens": 1170275690.0, "step": 7730 }, { "epoch": 2.2823841841251107, "grad_norm": 0.2921856626129041, "learning_rate": 1.3295081967213116e-05, "loss": 0.4138, "num_tokens": 1171043123.0, "step": 7735 }, { "epoch": 2.283859545588669, "grad_norm": 0.2759255031130324, "learning_rate": 1.3267759562841531e-05, "loss": 0.4011, "num_tokens": 1171813379.0, "step": 7740 }, { "epoch": 2.2853349070522277, "grad_norm": 0.29512911650700224, "learning_rate": 1.3240437158469945e-05, "loss": 0.3934, "num_tokens": 1172614869.0, "step": 7745 }, { "epoch": 2.2868102685157865, "grad_norm": 0.2581546401403993, "learning_rate": 1.321311475409836e-05, "loss": 0.3977, "num_tokens": 1173437813.0, "step": 7750 }, { "epoch": 2.288285629979345, "grad_norm": 0.3033096560399989, "learning_rate": 1.3185792349726778e-05, "loss": 0.3894, "num_tokens": 1174142950.0, "step": 7755 }, { "epoch": 2.2897609914429036, "grad_norm": 0.2528517151928355, "learning_rate": 1.3158469945355193e-05, "loss": 0.4125, "num_tokens": 1174957902.0, "step": 7760 }, { "epoch": 2.291236352906462, "grad_norm": 0.2699801648722403, "learning_rate": 1.3131147540983606e-05, "loss": 0.3971, "num_tokens": 1175680816.0, "step": 7765 }, { "epoch": 2.2927117143700206, "grad_norm": 0.2966264518439965, "learning_rate": 1.3103825136612021e-05, "loss": 0.4034, "num_tokens": 1176466797.0, "step": 7770 }, { "epoch": 2.2941870758335794, "grad_norm": 0.2720461344041807, "learning_rate": 1.3076502732240436e-05, "loss": 0.3859, "num_tokens": 1177219584.0, "step": 7775 }, { "epoch": 2.2956624372971377, "grad_norm": 0.25257936793121477, "learning_rate": 1.3049180327868853e-05, "loss": 0.394, "num_tokens": 1178009219.0, "step": 7780 }, { "epoch": 2.2971377987606965, "grad_norm": 0.26442351204345843, "learning_rate": 1.3021857923497268e-05, "loss": 0.3881, "num_tokens": 1178773469.0, "step": 7785 }, { "epoch": 2.2986131602242548, "grad_norm": 0.26373489038856857, "learning_rate": 1.2994535519125683e-05, "loss": 0.4132, "num_tokens": 1179574859.0, "step": 7790 }, { "epoch": 2.3000885216878135, "grad_norm": 0.2820773187655366, "learning_rate": 1.2967213114754098e-05, "loss": 0.3967, "num_tokens": 1180318514.0, "step": 7795 }, { "epoch": 2.3015638831513723, "grad_norm": 0.2607443129332514, "learning_rate": 1.2939890710382515e-05, "loss": 0.4081, "num_tokens": 1181106037.0, "step": 7800 }, { "epoch": 2.3030392446149306, "grad_norm": 0.30201001748310874, "learning_rate": 1.291256830601093e-05, "loss": 0.4138, "num_tokens": 1181869842.0, "step": 7805 }, { "epoch": 2.3045146060784893, "grad_norm": 0.2844501509007233, "learning_rate": 1.2885245901639345e-05, "loss": 0.3921, "num_tokens": 1182618083.0, "step": 7810 }, { "epoch": 2.3059899675420477, "grad_norm": 0.2834774121024499, "learning_rate": 1.285792349726776e-05, "loss": 0.402, "num_tokens": 1183375350.0, "step": 7815 }, { "epoch": 2.3074653290056064, "grad_norm": 0.2966474001344498, "learning_rate": 1.2830601092896175e-05, "loss": 0.3915, "num_tokens": 1184080260.0, "step": 7820 }, { "epoch": 2.308940690469165, "grad_norm": 0.25197158992528806, "learning_rate": 1.2803278688524592e-05, "loss": 0.4054, "num_tokens": 1184903032.0, "step": 7825 }, { "epoch": 2.3104160519327235, "grad_norm": 0.26674019275726285, "learning_rate": 1.2775956284153007e-05, "loss": 0.3878, "num_tokens": 1185661424.0, "step": 7830 }, { "epoch": 2.3118914133962822, "grad_norm": 0.26141274736696357, "learning_rate": 1.2748633879781422e-05, "loss": 0.4011, "num_tokens": 1186449868.0, "step": 7835 }, { "epoch": 2.3133667748598405, "grad_norm": 0.2555310069843455, "learning_rate": 1.2721311475409837e-05, "loss": 0.4073, "num_tokens": 1187280775.0, "step": 7840 }, { "epoch": 2.3148421363233993, "grad_norm": 0.2530867880903473, "learning_rate": 1.269398907103825e-05, "loss": 0.3944, "num_tokens": 1188055625.0, "step": 7845 }, { "epoch": 2.3163174977869576, "grad_norm": 0.287616689841404, "learning_rate": 1.2666666666666668e-05, "loss": 0.3946, "num_tokens": 1188805109.0, "step": 7850 }, { "epoch": 2.3177928592505164, "grad_norm": 0.26570221581832393, "learning_rate": 1.2639344262295084e-05, "loss": 0.386, "num_tokens": 1189581847.0, "step": 7855 }, { "epoch": 2.319268220714075, "grad_norm": 0.26297563023560866, "learning_rate": 1.2612021857923499e-05, "loss": 0.3896, "num_tokens": 1190330829.0, "step": 7860 }, { "epoch": 2.3207435821776334, "grad_norm": 0.24789291920328124, "learning_rate": 1.2584699453551912e-05, "loss": 0.4033, "num_tokens": 1191155449.0, "step": 7865 }, { "epoch": 2.322218943641192, "grad_norm": 0.2623700540879629, "learning_rate": 1.2557377049180327e-05, "loss": 0.3915, "num_tokens": 1191880865.0, "step": 7870 }, { "epoch": 2.3236943051047505, "grad_norm": 0.26129323711557, "learning_rate": 1.2530054644808745e-05, "loss": 0.4073, "num_tokens": 1192685047.0, "step": 7875 }, { "epoch": 2.3251696665683093, "grad_norm": 0.2874480513490268, "learning_rate": 1.2502732240437159e-05, "loss": 0.4, "num_tokens": 1193444474.0, "step": 7880 }, { "epoch": 2.3266450280318676, "grad_norm": 0.29645736088151703, "learning_rate": 1.2475409836065574e-05, "loss": 0.3916, "num_tokens": 1194193520.0, "step": 7885 }, { "epoch": 2.3281203894954263, "grad_norm": 0.2664271552073131, "learning_rate": 1.244808743169399e-05, "loss": 0.385, "num_tokens": 1194891220.0, "step": 7890 }, { "epoch": 2.329595750958985, "grad_norm": 0.275895337822949, "learning_rate": 1.2420765027322405e-05, "loss": 0.4078, "num_tokens": 1195689533.0, "step": 7895 }, { "epoch": 2.3310711124225434, "grad_norm": 0.2867309130653135, "learning_rate": 1.239344262295082e-05, "loss": 0.3918, "num_tokens": 1196451282.0, "step": 7900 }, { "epoch": 2.332546473886102, "grad_norm": 0.2689624915862047, "learning_rate": 1.2366120218579235e-05, "loss": 0.4048, "num_tokens": 1197254822.0, "step": 7905 }, { "epoch": 2.3340218353496605, "grad_norm": 0.2764968968208129, "learning_rate": 1.233879781420765e-05, "loss": 0.384, "num_tokens": 1197989467.0, "step": 7910 }, { "epoch": 2.335497196813219, "grad_norm": 0.26735752436912413, "learning_rate": 1.2311475409836067e-05, "loss": 0.3921, "num_tokens": 1198731901.0, "step": 7915 }, { "epoch": 2.336972558276778, "grad_norm": 0.2865982235227927, "learning_rate": 1.2284153005464482e-05, "loss": 0.3703, "num_tokens": 1199454560.0, "step": 7920 }, { "epoch": 2.3384479197403363, "grad_norm": 0.24986494282091412, "learning_rate": 1.2256830601092897e-05, "loss": 0.4022, "num_tokens": 1200285684.0, "step": 7925 }, { "epoch": 2.339923281203895, "grad_norm": 0.26750178643231515, "learning_rate": 1.2229508196721312e-05, "loss": 0.3883, "num_tokens": 1201073914.0, "step": 7930 }, { "epoch": 2.3413986426674533, "grad_norm": 0.28233872917784625, "learning_rate": 1.2202185792349727e-05, "loss": 0.4015, "num_tokens": 1201834356.0, "step": 7935 }, { "epoch": 2.342874004131012, "grad_norm": 0.30629253220293007, "learning_rate": 1.2174863387978142e-05, "loss": 0.392, "num_tokens": 1202567225.0, "step": 7940 }, { "epoch": 2.344349365594571, "grad_norm": 0.2622723527582185, "learning_rate": 1.2147540983606557e-05, "loss": 0.3889, "num_tokens": 1203333879.0, "step": 7945 }, { "epoch": 2.345824727058129, "grad_norm": 0.2588751675210825, "learning_rate": 1.2120218579234974e-05, "loss": 0.3923, "num_tokens": 1204079834.0, "step": 7950 }, { "epoch": 2.347300088521688, "grad_norm": 0.259949436121917, "learning_rate": 1.2092896174863389e-05, "loss": 0.3832, "num_tokens": 1204807656.0, "step": 7955 }, { "epoch": 2.3487754499852462, "grad_norm": 0.26637269732551405, "learning_rate": 1.2065573770491804e-05, "loss": 0.3989, "num_tokens": 1205564519.0, "step": 7960 }, { "epoch": 2.350250811448805, "grad_norm": 0.2908438825751387, "learning_rate": 1.2038251366120219e-05, "loss": 0.3968, "num_tokens": 1206297751.0, "step": 7965 }, { "epoch": 2.3517261729123637, "grad_norm": 0.25340212101642307, "learning_rate": 1.2010928961748634e-05, "loss": 0.397, "num_tokens": 1207060649.0, "step": 7970 }, { "epoch": 2.353201534375922, "grad_norm": 0.2561764920381203, "learning_rate": 1.198360655737705e-05, "loss": 0.4086, "num_tokens": 1207844798.0, "step": 7975 }, { "epoch": 2.354676895839481, "grad_norm": 0.27336580124589366, "learning_rate": 1.1956284153005464e-05, "loss": 0.3902, "num_tokens": 1208594021.0, "step": 7980 }, { "epoch": 2.356152257303039, "grad_norm": 0.2570258950676296, "learning_rate": 1.192896174863388e-05, "loss": 0.4019, "num_tokens": 1209381291.0, "step": 7985 }, { "epoch": 2.357627618766598, "grad_norm": 0.2687654029922856, "learning_rate": 1.1901639344262296e-05, "loss": 0.3934, "num_tokens": 1210184055.0, "step": 7990 }, { "epoch": 2.3591029802301566, "grad_norm": 0.2735474199036968, "learning_rate": 1.187431693989071e-05, "loss": 0.3961, "num_tokens": 1210917068.0, "step": 7995 }, { "epoch": 2.360578341693715, "grad_norm": 0.2699253587307501, "learning_rate": 1.1846994535519126e-05, "loss": 0.3802, "num_tokens": 1211651980.0, "step": 8000 }, { "epoch": 2.3620537031572737, "grad_norm": 0.2979961200400008, "learning_rate": 1.181967213114754e-05, "loss": 0.3824, "num_tokens": 1212393621.0, "step": 8005 }, { "epoch": 2.363529064620832, "grad_norm": 0.27443025599586485, "learning_rate": 1.1792349726775957e-05, "loss": 0.3836, "num_tokens": 1213133368.0, "step": 8010 }, { "epoch": 2.3650044260843908, "grad_norm": 0.2897909316033054, "learning_rate": 1.1765027322404372e-05, "loss": 0.3782, "num_tokens": 1213845850.0, "step": 8015 }, { "epoch": 2.366479787547949, "grad_norm": 0.2795572377849176, "learning_rate": 1.1737704918032788e-05, "loss": 0.3965, "num_tokens": 1214637073.0, "step": 8020 }, { "epoch": 2.367955149011508, "grad_norm": 0.3010359412128409, "learning_rate": 1.1710382513661203e-05, "loss": 0.3818, "num_tokens": 1215351353.0, "step": 8025 }, { "epoch": 2.369430510475066, "grad_norm": 0.2940112933009273, "learning_rate": 1.1683060109289618e-05, "loss": 0.3877, "num_tokens": 1216104019.0, "step": 8030 }, { "epoch": 2.370905871938625, "grad_norm": 0.2822167999717945, "learning_rate": 1.1655737704918034e-05, "loss": 0.3936, "num_tokens": 1216860539.0, "step": 8035 }, { "epoch": 2.3723812334021837, "grad_norm": 0.273526110368533, "learning_rate": 1.1628415300546448e-05, "loss": 0.3777, "num_tokens": 1217587559.0, "step": 8040 }, { "epoch": 2.373856594865742, "grad_norm": 0.28221686183234485, "learning_rate": 1.1601092896174864e-05, "loss": 0.4015, "num_tokens": 1218357726.0, "step": 8045 }, { "epoch": 2.3753319563293007, "grad_norm": 0.2632494862458821, "learning_rate": 1.157377049180328e-05, "loss": 0.3923, "num_tokens": 1219073416.0, "step": 8050 }, { "epoch": 2.376807317792859, "grad_norm": 0.28160897415221275, "learning_rate": 1.1546448087431696e-05, "loss": 0.3951, "num_tokens": 1219855373.0, "step": 8055 }, { "epoch": 2.378282679256418, "grad_norm": 0.27678068694069413, "learning_rate": 1.151912568306011e-05, "loss": 0.3939, "num_tokens": 1220640929.0, "step": 8060 }, { "epoch": 2.3797580407199765, "grad_norm": 0.2703641016314514, "learning_rate": 1.1491803278688524e-05, "loss": 0.392, "num_tokens": 1221382323.0, "step": 8065 }, { "epoch": 2.381233402183535, "grad_norm": 0.2814652755495772, "learning_rate": 1.1464480874316941e-05, "loss": 0.392, "num_tokens": 1222155078.0, "step": 8070 }, { "epoch": 2.3827087636470936, "grad_norm": 0.2713553200769635, "learning_rate": 1.1437158469945356e-05, "loss": 0.3921, "num_tokens": 1222927211.0, "step": 8075 }, { "epoch": 2.384184125110652, "grad_norm": 0.31078249661842217, "learning_rate": 1.1409836065573771e-05, "loss": 0.3865, "num_tokens": 1223683097.0, "step": 8080 }, { "epoch": 2.3856594865742107, "grad_norm": 0.2838731445255209, "learning_rate": 1.1382513661202186e-05, "loss": 0.3978, "num_tokens": 1224441579.0, "step": 8085 }, { "epoch": 2.3871348480377694, "grad_norm": 0.25695712041859403, "learning_rate": 1.1355191256830601e-05, "loss": 0.3942, "num_tokens": 1225215221.0, "step": 8090 }, { "epoch": 2.3886102095013277, "grad_norm": 0.2830223848071753, "learning_rate": 1.1327868852459018e-05, "loss": 0.3858, "num_tokens": 1225990283.0, "step": 8095 }, { "epoch": 2.3900855709648865, "grad_norm": 0.2681970256088667, "learning_rate": 1.1300546448087431e-05, "loss": 0.394, "num_tokens": 1226751264.0, "step": 8100 }, { "epoch": 2.391560932428445, "grad_norm": 0.2619174588057909, "learning_rate": 1.1273224043715848e-05, "loss": 0.3837, "num_tokens": 1227480665.0, "step": 8105 }, { "epoch": 2.3930362938920036, "grad_norm": 0.2675241241687795, "learning_rate": 1.1245901639344263e-05, "loss": 0.3807, "num_tokens": 1228240015.0, "step": 8110 }, { "epoch": 2.3945116553555623, "grad_norm": 0.29326474514444956, "learning_rate": 1.1218579234972678e-05, "loss": 0.3957, "num_tokens": 1228927587.0, "step": 8115 }, { "epoch": 2.3959870168191206, "grad_norm": 0.26388170760190516, "learning_rate": 1.1191256830601093e-05, "loss": 0.3936, "num_tokens": 1229702836.0, "step": 8120 }, { "epoch": 2.3974623782826794, "grad_norm": 0.2780155577831509, "learning_rate": 1.1163934426229508e-05, "loss": 0.4019, "num_tokens": 1230442537.0, "step": 8125 }, { "epoch": 2.3989377397462377, "grad_norm": 0.2627606611223777, "learning_rate": 1.1136612021857925e-05, "loss": 0.3829, "num_tokens": 1231197719.0, "step": 8130 }, { "epoch": 2.4004131012097965, "grad_norm": 0.27609346561476994, "learning_rate": 1.110928961748634e-05, "loss": 0.4048, "num_tokens": 1231957527.0, "step": 8135 }, { "epoch": 2.401888462673355, "grad_norm": 0.2573535733133017, "learning_rate": 1.1081967213114755e-05, "loss": 0.3904, "num_tokens": 1232714623.0, "step": 8140 }, { "epoch": 2.4033638241369135, "grad_norm": 0.2786969950283373, "learning_rate": 1.105464480874317e-05, "loss": 0.3976, "num_tokens": 1233480026.0, "step": 8145 }, { "epoch": 2.4048391856004723, "grad_norm": 0.2798224023116836, "learning_rate": 1.1027322404371586e-05, "loss": 0.3949, "num_tokens": 1234225668.0, "step": 8150 }, { "epoch": 2.4063145470640306, "grad_norm": 0.277466643396241, "learning_rate": 1.1000000000000001e-05, "loss": 0.4014, "num_tokens": 1234975788.0, "step": 8155 }, { "epoch": 2.4077899085275893, "grad_norm": 0.281542763792784, "learning_rate": 1.0972677595628415e-05, "loss": 0.4047, "num_tokens": 1235806233.0, "step": 8160 }, { "epoch": 2.4092652699911477, "grad_norm": 0.2835659708856736, "learning_rate": 1.0945355191256831e-05, "loss": 0.3823, "num_tokens": 1236527683.0, "step": 8165 }, { "epoch": 2.4107406314547064, "grad_norm": 0.308585385202604, "learning_rate": 1.0918032786885246e-05, "loss": 0.3816, "num_tokens": 1237231069.0, "step": 8170 }, { "epoch": 2.412215992918265, "grad_norm": 0.2669585961088539, "learning_rate": 1.0890710382513661e-05, "loss": 0.4101, "num_tokens": 1238083468.0, "step": 8175 }, { "epoch": 2.4136913543818235, "grad_norm": 0.2878277596961725, "learning_rate": 1.0863387978142076e-05, "loss": 0.3946, "num_tokens": 1238854290.0, "step": 8180 }, { "epoch": 2.4151667158453822, "grad_norm": 0.32259547614443845, "learning_rate": 1.0836065573770492e-05, "loss": 0.3963, "num_tokens": 1239632987.0, "step": 8185 }, { "epoch": 2.4166420773089405, "grad_norm": 0.2747947154462544, "learning_rate": 1.0808743169398908e-05, "loss": 0.3903, "num_tokens": 1240363305.0, "step": 8190 }, { "epoch": 2.4181174387724993, "grad_norm": 0.2783966447188167, "learning_rate": 1.0781420765027323e-05, "loss": 0.4089, "num_tokens": 1241145729.0, "step": 8195 }, { "epoch": 2.4195928002360576, "grad_norm": 0.24721589304440192, "learning_rate": 1.0754098360655738e-05, "loss": 0.3951, "num_tokens": 1241901886.0, "step": 8200 }, { "epoch": 2.4210681616996164, "grad_norm": 0.3173642672524503, "learning_rate": 1.0726775956284153e-05, "loss": 0.3902, "num_tokens": 1242652253.0, "step": 8205 }, { "epoch": 2.422543523163175, "grad_norm": 0.29438224621337505, "learning_rate": 1.069945355191257e-05, "loss": 0.4035, "num_tokens": 1243372270.0, "step": 8210 }, { "epoch": 2.4240188846267334, "grad_norm": 0.2875307683327725, "learning_rate": 1.0672131147540983e-05, "loss": 0.4009, "num_tokens": 1244122950.0, "step": 8215 }, { "epoch": 2.425494246090292, "grad_norm": 0.27785312712573107, "learning_rate": 1.0644808743169398e-05, "loss": 0.4003, "num_tokens": 1244891058.0, "step": 8220 }, { "epoch": 2.4269696075538505, "grad_norm": 0.28744671716432973, "learning_rate": 1.0617486338797815e-05, "loss": 0.3889, "num_tokens": 1245677125.0, "step": 8225 }, { "epoch": 2.4284449690174092, "grad_norm": 0.26529264963919497, "learning_rate": 1.059016393442623e-05, "loss": 0.3902, "num_tokens": 1246434917.0, "step": 8230 }, { "epoch": 2.429920330480968, "grad_norm": 0.24626596270349596, "learning_rate": 1.0562841530054645e-05, "loss": 0.3793, "num_tokens": 1247194514.0, "step": 8235 }, { "epoch": 2.4313956919445263, "grad_norm": 0.26855393314627796, "learning_rate": 1.053551912568306e-05, "loss": 0.3941, "num_tokens": 1247951044.0, "step": 8240 }, { "epoch": 2.432871053408085, "grad_norm": 0.31635313756795824, "learning_rate": 1.0508196721311477e-05, "loss": 0.4024, "num_tokens": 1248698584.0, "step": 8245 }, { "epoch": 2.4343464148716434, "grad_norm": 0.24402421347213674, "learning_rate": 1.0480874316939892e-05, "loss": 0.384, "num_tokens": 1249465102.0, "step": 8250 }, { "epoch": 2.435821776335202, "grad_norm": 0.2530467145984236, "learning_rate": 1.0453551912568305e-05, "loss": 0.404, "num_tokens": 1250248532.0, "step": 8255 }, { "epoch": 2.437297137798761, "grad_norm": 0.26695829133381566, "learning_rate": 1.0426229508196722e-05, "loss": 0.4084, "num_tokens": 1250996965.0, "step": 8260 }, { "epoch": 2.438772499262319, "grad_norm": 0.26813323801628947, "learning_rate": 1.0398907103825137e-05, "loss": 0.374, "num_tokens": 1251749374.0, "step": 8265 }, { "epoch": 2.440247860725878, "grad_norm": 0.25352663619687565, "learning_rate": 1.0371584699453554e-05, "loss": 0.3801, "num_tokens": 1252522069.0, "step": 8270 }, { "epoch": 2.4417232221894363, "grad_norm": 0.26113940037745675, "learning_rate": 1.0344262295081967e-05, "loss": 0.3969, "num_tokens": 1253307302.0, "step": 8275 }, { "epoch": 2.443198583652995, "grad_norm": 0.26194057475289484, "learning_rate": 1.0316939890710384e-05, "loss": 0.3843, "num_tokens": 1254057581.0, "step": 8280 }, { "epoch": 2.444673945116554, "grad_norm": 0.27218274785639346, "learning_rate": 1.0289617486338799e-05, "loss": 0.4005, "num_tokens": 1254827711.0, "step": 8285 }, { "epoch": 2.446149306580112, "grad_norm": 0.26665966655087214, "learning_rate": 1.0262295081967214e-05, "loss": 0.387, "num_tokens": 1255619186.0, "step": 8290 }, { "epoch": 2.447624668043671, "grad_norm": 0.290230159651935, "learning_rate": 1.0234972677595629e-05, "loss": 0.3691, "num_tokens": 1256295031.0, "step": 8295 }, { "epoch": 2.449100029507229, "grad_norm": 0.27603186995827833, "learning_rate": 1.0207650273224044e-05, "loss": 0.4, "num_tokens": 1257087296.0, "step": 8300 }, { "epoch": 2.450575390970788, "grad_norm": 0.28012240173469805, "learning_rate": 1.018032786885246e-05, "loss": 0.39, "num_tokens": 1257809241.0, "step": 8305 }, { "epoch": 2.4520507524343462, "grad_norm": 0.29115451428245415, "learning_rate": 1.0153005464480875e-05, "loss": 0.3884, "num_tokens": 1258538703.0, "step": 8310 }, { "epoch": 2.453526113897905, "grad_norm": 0.2836070771297043, "learning_rate": 1.0125683060109289e-05, "loss": 0.3872, "num_tokens": 1259302251.0, "step": 8315 }, { "epoch": 2.4550014753614637, "grad_norm": 0.2682488773289176, "learning_rate": 1.0098360655737705e-05, "loss": 0.3904, "num_tokens": 1260077146.0, "step": 8320 }, { "epoch": 2.456476836825022, "grad_norm": 0.30503714879316574, "learning_rate": 1.007103825136612e-05, "loss": 0.3726, "num_tokens": 1260752842.0, "step": 8325 }, { "epoch": 2.457952198288581, "grad_norm": 0.27084144529230364, "learning_rate": 1.0043715846994537e-05, "loss": 0.3912, "num_tokens": 1261551973.0, "step": 8330 }, { "epoch": 2.459427559752139, "grad_norm": 0.27157142652979827, "learning_rate": 1.001639344262295e-05, "loss": 0.4021, "num_tokens": 1262338621.0, "step": 8335 }, { "epoch": 2.460902921215698, "grad_norm": 0.2603245021918322, "learning_rate": 9.989071038251367e-06, "loss": 0.3989, "num_tokens": 1263140896.0, "step": 8340 }, { "epoch": 2.462378282679256, "grad_norm": 0.29866399138140365, "learning_rate": 9.961748633879782e-06, "loss": 0.4085, "num_tokens": 1263933382.0, "step": 8345 }, { "epoch": 2.463853644142815, "grad_norm": 0.29104368188650687, "learning_rate": 9.934426229508197e-06, "loss": 0.3967, "num_tokens": 1264637755.0, "step": 8350 }, { "epoch": 2.4653290056063737, "grad_norm": 0.32044860285525084, "learning_rate": 9.907103825136612e-06, "loss": 0.4002, "num_tokens": 1265340529.0, "step": 8355 }, { "epoch": 2.466804367069932, "grad_norm": 0.27152875913889196, "learning_rate": 9.879781420765027e-06, "loss": 0.3811, "num_tokens": 1266092836.0, "step": 8360 }, { "epoch": 2.4682797285334908, "grad_norm": 0.25590473043533823, "learning_rate": 9.852459016393444e-06, "loss": 0.4017, "num_tokens": 1266882805.0, "step": 8365 }, { "epoch": 2.469755089997049, "grad_norm": 0.2732030572130765, "learning_rate": 9.825136612021859e-06, "loss": 0.3982, "num_tokens": 1267656168.0, "step": 8370 }, { "epoch": 2.471230451460608, "grad_norm": 0.2829393915319978, "learning_rate": 9.797814207650274e-06, "loss": 0.3882, "num_tokens": 1268411299.0, "step": 8375 }, { "epoch": 2.4727058129241666, "grad_norm": 0.29988813853610435, "learning_rate": 9.770491803278689e-06, "loss": 0.4163, "num_tokens": 1269204814.0, "step": 8380 }, { "epoch": 2.474181174387725, "grad_norm": 0.27713886234976515, "learning_rate": 9.743169398907104e-06, "loss": 0.384, "num_tokens": 1269907395.0, "step": 8385 }, { "epoch": 2.4756565358512836, "grad_norm": 0.25476861255909444, "learning_rate": 9.71584699453552e-06, "loss": 0.4046, "num_tokens": 1270704398.0, "step": 8390 }, { "epoch": 2.477131897314842, "grad_norm": 0.2791081773025259, "learning_rate": 9.688524590163934e-06, "loss": 0.3949, "num_tokens": 1271451710.0, "step": 8395 }, { "epoch": 2.4786072587784007, "grad_norm": 0.28870702457479636, "learning_rate": 9.66120218579235e-06, "loss": 0.4018, "num_tokens": 1272217339.0, "step": 8400 }, { "epoch": 2.4800826202419595, "grad_norm": 0.2689289290378706, "learning_rate": 9.633879781420766e-06, "loss": 0.3865, "num_tokens": 1272967902.0, "step": 8405 }, { "epoch": 2.481557981705518, "grad_norm": 0.3103494009326824, "learning_rate": 9.60655737704918e-06, "loss": 0.3737, "num_tokens": 1273634127.0, "step": 8410 }, { "epoch": 2.4830333431690765, "grad_norm": 0.2706617034775439, "learning_rate": 9.579234972677596e-06, "loss": 0.4023, "num_tokens": 1274350368.0, "step": 8415 }, { "epoch": 2.484508704632635, "grad_norm": 0.27973639744145823, "learning_rate": 9.55191256830601e-06, "loss": 0.3957, "num_tokens": 1275087175.0, "step": 8420 }, { "epoch": 2.4859840660961936, "grad_norm": 0.27414022631756263, "learning_rate": 9.524590163934428e-06, "loss": 0.3975, "num_tokens": 1275865708.0, "step": 8425 }, { "epoch": 2.4874594275597524, "grad_norm": 0.2706352174174677, "learning_rate": 9.497267759562843e-06, "loss": 0.3941, "num_tokens": 1276659620.0, "step": 8430 }, { "epoch": 2.4889347890233107, "grad_norm": 0.26881442040040054, "learning_rate": 9.469945355191258e-06, "loss": 0.3911, "num_tokens": 1277437999.0, "step": 8435 }, { "epoch": 2.4904101504868694, "grad_norm": 0.2780015700192069, "learning_rate": 9.442622950819673e-06, "loss": 0.3969, "num_tokens": 1278215226.0, "step": 8440 }, { "epoch": 2.4918855119504277, "grad_norm": 0.30394467493283567, "learning_rate": 9.415300546448088e-06, "loss": 0.3797, "num_tokens": 1278930425.0, "step": 8445 }, { "epoch": 2.4933608734139865, "grad_norm": 0.2701948630740562, "learning_rate": 9.387978142076503e-06, "loss": 0.4051, "num_tokens": 1279692182.0, "step": 8450 }, { "epoch": 2.4948362348775452, "grad_norm": 0.25825072663963705, "learning_rate": 9.360655737704918e-06, "loss": 0.4085, "num_tokens": 1280459343.0, "step": 8455 }, { "epoch": 2.4963115963411036, "grad_norm": 0.2981352792726616, "learning_rate": 9.333333333333334e-06, "loss": 0.3919, "num_tokens": 1281237547.0, "step": 8460 }, { "epoch": 2.4977869578046623, "grad_norm": 0.28263662633116454, "learning_rate": 9.30601092896175e-06, "loss": 0.3934, "num_tokens": 1282044618.0, "step": 8465 }, { "epoch": 2.4992623192682206, "grad_norm": 0.2759173214872667, "learning_rate": 9.278688524590164e-06, "loss": 0.4032, "num_tokens": 1282792017.0, "step": 8470 }, { "epoch": 2.5007376807317794, "grad_norm": 0.2600344733542924, "learning_rate": 9.25136612021858e-06, "loss": 0.4124, "num_tokens": 1283581441.0, "step": 8475 }, { "epoch": 2.502213042195338, "grad_norm": 0.2651004293214792, "learning_rate": 9.224043715846994e-06, "loss": 0.3838, "num_tokens": 1284309936.0, "step": 8480 }, { "epoch": 2.5036884036588964, "grad_norm": 0.3054783498215468, "learning_rate": 9.196721311475411e-06, "loss": 0.3752, "num_tokens": 1285014533.0, "step": 8485 }, { "epoch": 2.5051637651224548, "grad_norm": 0.2713273504057507, "learning_rate": 9.169398907103824e-06, "loss": 0.3831, "num_tokens": 1285777832.0, "step": 8490 }, { "epoch": 2.5066391265860135, "grad_norm": 0.2854630899870263, "learning_rate": 9.142076502732241e-06, "loss": 0.3919, "num_tokens": 1286529796.0, "step": 8495 }, { "epoch": 2.5081144880495723, "grad_norm": 0.2736115550258984, "learning_rate": 9.114754098360656e-06, "loss": 0.3828, "num_tokens": 1287216946.0, "step": 8500 }, { "epoch": 2.5095898495131306, "grad_norm": 0.2957173356573747, "learning_rate": 9.087431693989071e-06, "loss": 0.3807, "num_tokens": 694631.0, "step": 8505 }, { "epoch": 2.5110652109766893, "grad_norm": 0.2895837250503884, "learning_rate": 9.060109289617486e-06, "loss": 0.3861, "num_tokens": 1425374.0, "step": 8510 }, { "epoch": 2.5125405724402476, "grad_norm": 0.27526610596853335, "learning_rate": 9.032786885245901e-06, "loss": 0.4114, "num_tokens": 2188061.0, "step": 8515 }, { "epoch": 2.5140159339038064, "grad_norm": 0.27863021579794195, "learning_rate": 9.005464480874318e-06, "loss": 0.393, "num_tokens": 2923652.0, "step": 8520 }, { "epoch": 2.515491295367365, "grad_norm": 0.25159225333088214, "learning_rate": 8.978142076502733e-06, "loss": 0.3861, "num_tokens": 3737332.0, "step": 8525 }, { "epoch": 2.5169666568309235, "grad_norm": 0.30590966326834673, "learning_rate": 8.950819672131148e-06, "loss": 0.3928, "num_tokens": 4503942.0, "step": 8530 }, { "epoch": 2.518442018294482, "grad_norm": 0.2633338501616148, "learning_rate": 8.923497267759563e-06, "loss": 0.3864, "num_tokens": 5294571.0, "step": 8535 }, { "epoch": 2.5199173797580405, "grad_norm": 0.29128430736490396, "learning_rate": 8.896174863387978e-06, "loss": 0.3763, "num_tokens": 5990757.0, "step": 8540 }, { "epoch": 2.5213927412215993, "grad_norm": 0.3035724272474397, "learning_rate": 8.868852459016395e-06, "loss": 0.3828, "num_tokens": 6734334.0, "step": 8545 }, { "epoch": 2.522868102685158, "grad_norm": 0.300174332415416, "learning_rate": 8.841530054644808e-06, "loss": 0.3972, "num_tokens": 7511559.0, "step": 8550 }, { "epoch": 2.5243434641487164, "grad_norm": 0.2756692639590811, "learning_rate": 8.814207650273225e-06, "loss": 0.3904, "num_tokens": 8279298.0, "step": 8555 }, { "epoch": 2.525818825612275, "grad_norm": 0.32703709787949753, "learning_rate": 8.78688524590164e-06, "loss": 0.403, "num_tokens": 9026065.0, "step": 8560 }, { "epoch": 2.5272941870758334, "grad_norm": 0.2784500493696018, "learning_rate": 8.759562841530056e-06, "loss": 0.4093, "num_tokens": 9865211.0, "step": 8565 }, { "epoch": 2.528769548539392, "grad_norm": 0.2831285222523414, "learning_rate": 8.73224043715847e-06, "loss": 0.4148, "num_tokens": 10677366.0, "step": 8570 }, { "epoch": 2.530244910002951, "grad_norm": 0.30324185450873054, "learning_rate": 8.704918032786885e-06, "loss": 0.419, "num_tokens": 11448563.0, "step": 8575 }, { "epoch": 2.5317202714665092, "grad_norm": 0.28653298290358775, "learning_rate": 8.677595628415301e-06, "loss": 0.3875, "num_tokens": 12198846.0, "step": 8580 }, { "epoch": 2.533195632930068, "grad_norm": 0.33085697661003954, "learning_rate": 8.650273224043716e-06, "loss": 0.3924, "num_tokens": 12888275.0, "step": 8585 }, { "epoch": 2.5346709943936263, "grad_norm": 0.259065208829237, "learning_rate": 8.622950819672132e-06, "loss": 0.3896, "num_tokens": 13644799.0, "step": 8590 }, { "epoch": 2.536146355857185, "grad_norm": 0.26849595670508924, "learning_rate": 8.595628415300547e-06, "loss": 0.409, "num_tokens": 14450070.0, "step": 8595 }, { "epoch": 2.537621717320744, "grad_norm": 0.28530701393159863, "learning_rate": 8.568306010928963e-06, "loss": 0.4064, "num_tokens": 15196041.0, "step": 8600 }, { "epoch": 2.539097078784302, "grad_norm": 0.2714707845920988, "learning_rate": 8.540983606557378e-06, "loss": 0.4019, "num_tokens": 15955512.0, "step": 8605 }, { "epoch": 2.540572440247861, "grad_norm": 0.2800127319903426, "learning_rate": 8.513661202185792e-06, "loss": 0.3913, "num_tokens": 16723599.0, "step": 8610 }, { "epoch": 2.542047801711419, "grad_norm": 0.2609532075296624, "learning_rate": 8.486338797814208e-06, "loss": 0.3754, "num_tokens": 17414908.0, "step": 8615 }, { "epoch": 2.543523163174978, "grad_norm": 0.28564117417440066, "learning_rate": 8.459016393442623e-06, "loss": 0.3729, "num_tokens": 18133365.0, "step": 8620 }, { "epoch": 2.5449985246385367, "grad_norm": 0.34601563197099056, "learning_rate": 8.43169398907104e-06, "loss": 0.373, "num_tokens": 18834922.0, "step": 8625 }, { "epoch": 2.546473886102095, "grad_norm": 0.2495577666889519, "learning_rate": 8.404371584699453e-06, "loss": 0.4134, "num_tokens": 19672530.0, "step": 8630 }, { "epoch": 2.5479492475656533, "grad_norm": 0.27527172771914815, "learning_rate": 8.377049180327868e-06, "loss": 0.3843, "num_tokens": 20392719.0, "step": 8635 }, { "epoch": 2.549424609029212, "grad_norm": 0.2768839315897001, "learning_rate": 8.349726775956285e-06, "loss": 0.3989, "num_tokens": 21170458.0, "step": 8640 }, { "epoch": 2.550899970492771, "grad_norm": 0.250293439205322, "learning_rate": 8.3224043715847e-06, "loss": 0.3887, "num_tokens": 21938185.0, "step": 8645 }, { "epoch": 2.552375331956329, "grad_norm": 0.2691569594837882, "learning_rate": 8.295081967213115e-06, "loss": 0.388, "num_tokens": 22704634.0, "step": 8650 }, { "epoch": 2.553850693419888, "grad_norm": 0.2564567205167326, "learning_rate": 8.26775956284153e-06, "loss": 0.3916, "num_tokens": 23463691.0, "step": 8655 }, { "epoch": 2.555326054883446, "grad_norm": 0.26952040880658384, "learning_rate": 8.240437158469947e-06, "loss": 0.3837, "num_tokens": 24157075.0, "step": 8660 }, { "epoch": 2.556801416347005, "grad_norm": 0.26596569023243655, "learning_rate": 8.213114754098362e-06, "loss": 0.409, "num_tokens": 24936571.0, "step": 8665 }, { "epoch": 2.5582767778105637, "grad_norm": 0.27986417326543683, "learning_rate": 8.185792349726775e-06, "loss": 0.3973, "num_tokens": 25696483.0, "step": 8670 }, { "epoch": 2.559752139274122, "grad_norm": 0.2818510943658345, "learning_rate": 8.158469945355192e-06, "loss": 0.385, "num_tokens": 26459906.0, "step": 8675 }, { "epoch": 2.561227500737681, "grad_norm": 0.27209910903632684, "learning_rate": 8.131147540983607e-06, "loss": 0.3794, "num_tokens": 27195977.0, "step": 8680 }, { "epoch": 2.562702862201239, "grad_norm": 0.25708381404454617, "learning_rate": 8.103825136612022e-06, "loss": 0.3974, "num_tokens": 27958903.0, "step": 8685 }, { "epoch": 2.564178223664798, "grad_norm": 0.27308807755553594, "learning_rate": 8.076502732240437e-06, "loss": 0.3899, "num_tokens": 28748425.0, "step": 8690 }, { "epoch": 2.5656535851283566, "grad_norm": 0.26651630423460154, "learning_rate": 8.049180327868854e-06, "loss": 0.3929, "num_tokens": 29532246.0, "step": 8695 }, { "epoch": 2.567128946591915, "grad_norm": 0.2752851437137757, "learning_rate": 8.021857923497269e-06, "loss": 0.3879, "num_tokens": 30306330.0, "step": 8700 }, { "epoch": 2.5686043080554737, "grad_norm": 0.2798879110780839, "learning_rate": 7.994535519125684e-06, "loss": 0.3845, "num_tokens": 31039072.0, "step": 8705 }, { "epoch": 2.570079669519032, "grad_norm": 0.28954481211362737, "learning_rate": 7.967213114754099e-06, "loss": 0.3827, "num_tokens": 31805750.0, "step": 8710 }, { "epoch": 2.5715550309825908, "grad_norm": 0.24450535202032211, "learning_rate": 7.939890710382514e-06, "loss": 0.3947, "num_tokens": 32622718.0, "step": 8715 }, { "epoch": 2.5730303924461495, "grad_norm": 0.2576084070218537, "learning_rate": 7.91256830601093e-06, "loss": 0.393, "num_tokens": 33398647.0, "step": 8720 }, { "epoch": 2.574505753909708, "grad_norm": 0.29697505506110616, "learning_rate": 7.885245901639344e-06, "loss": 0.3991, "num_tokens": 34189897.0, "step": 8725 }, { "epoch": 2.5759811153732666, "grad_norm": 0.2996745429023728, "learning_rate": 7.857923497267759e-06, "loss": 0.3597, "num_tokens": 34830509.0, "step": 8730 }, { "epoch": 2.577456476836825, "grad_norm": 0.2592448206148604, "learning_rate": 7.830601092896175e-06, "loss": 0.3956, "num_tokens": 35627373.0, "step": 8735 }, { "epoch": 2.5789318383003836, "grad_norm": 0.2903745951258576, "learning_rate": 7.80327868852459e-06, "loss": 0.4099, "num_tokens": 36423509.0, "step": 8740 }, { "epoch": 2.5804071997639424, "grad_norm": 0.2929673329081383, "learning_rate": 7.775956284153005e-06, "loss": 0.3911, "num_tokens": 37172506.0, "step": 8745 }, { "epoch": 2.5818825612275007, "grad_norm": 0.2805053335555033, "learning_rate": 7.74863387978142e-06, "loss": 0.3688, "num_tokens": 37878514.0, "step": 8750 }, { "epoch": 2.5833579226910595, "grad_norm": 0.2851219797688124, "learning_rate": 7.721311475409837e-06, "loss": 0.4084, "num_tokens": 38687157.0, "step": 8755 }, { "epoch": 2.5848332841546178, "grad_norm": 0.3123299694569007, "learning_rate": 7.693989071038252e-06, "loss": 0.3715, "num_tokens": 39382530.0, "step": 8760 }, { "epoch": 2.5863086456181765, "grad_norm": 0.2949751053594237, "learning_rate": 7.666666666666667e-06, "loss": 0.4083, "num_tokens": 40157271.0, "step": 8765 }, { "epoch": 2.5877840070817353, "grad_norm": 0.27173771358709947, "learning_rate": 7.639344262295082e-06, "loss": 0.3896, "num_tokens": 40863201.0, "step": 8770 }, { "epoch": 2.5892593685452936, "grad_norm": 0.2723343634454667, "learning_rate": 7.612021857923497e-06, "loss": 0.4072, "num_tokens": 41633729.0, "step": 8775 }, { "epoch": 2.590734730008852, "grad_norm": 0.28690567977150916, "learning_rate": 7.584699453551913e-06, "loss": 0.3736, "num_tokens": 42370012.0, "step": 8780 }, { "epoch": 2.5922100914724107, "grad_norm": 0.2709529774280743, "learning_rate": 7.557377049180328e-06, "loss": 0.3821, "num_tokens": 43097494.0, "step": 8785 }, { "epoch": 2.5936854529359694, "grad_norm": 0.2764661047237489, "learning_rate": 7.530054644808744e-06, "loss": 0.3899, "num_tokens": 43880022.0, "step": 8790 }, { "epoch": 2.5951608143995277, "grad_norm": 0.27958973960361966, "learning_rate": 7.502732240437159e-06, "loss": 0.3839, "num_tokens": 44645402.0, "step": 8795 }, { "epoch": 2.5966361758630865, "grad_norm": 0.2699122648043947, "learning_rate": 7.475409836065573e-06, "loss": 0.374, "num_tokens": 45357153.0, "step": 8800 }, { "epoch": 2.598111537326645, "grad_norm": 0.27269760987458075, "learning_rate": 7.44808743169399e-06, "loss": 0.387, "num_tokens": 46032636.0, "step": 8805 }, { "epoch": 2.5995868987902035, "grad_norm": 0.2945586297786851, "learning_rate": 7.420765027322404e-06, "loss": 0.3831, "num_tokens": 46755117.0, "step": 8810 }, { "epoch": 2.6010622602537623, "grad_norm": 0.26976501827746246, "learning_rate": 7.393442622950821e-06, "loss": 0.3943, "num_tokens": 47537786.0, "step": 8815 }, { "epoch": 2.6025376217173206, "grad_norm": 0.28663380130704197, "learning_rate": 7.366120218579235e-06, "loss": 0.3867, "num_tokens": 48251420.0, "step": 8820 }, { "epoch": 2.6040129831808794, "grad_norm": 0.28843862506906426, "learning_rate": 7.33879781420765e-06, "loss": 0.3829, "num_tokens": 49035190.0, "step": 8825 }, { "epoch": 2.6054883446444377, "grad_norm": 0.29037375326934334, "learning_rate": 7.311475409836066e-06, "loss": 0.4016, "num_tokens": 49811277.0, "step": 8830 }, { "epoch": 2.6069637061079964, "grad_norm": 0.2895760170980748, "learning_rate": 7.284153005464481e-06, "loss": 0.391, "num_tokens": 50524624.0, "step": 8835 }, { "epoch": 2.608439067571555, "grad_norm": 0.2849240584657679, "learning_rate": 7.256830601092897e-06, "loss": 0.3814, "num_tokens": 51264867.0, "step": 8840 }, { "epoch": 2.6099144290351135, "grad_norm": 0.25049280729287726, "learning_rate": 7.229508196721312e-06, "loss": 0.3815, "num_tokens": 52014079.0, "step": 8845 }, { "epoch": 2.6113897904986723, "grad_norm": 0.27941997610067904, "learning_rate": 7.202185792349728e-06, "loss": 0.3958, "num_tokens": 52825373.0, "step": 8850 }, { "epoch": 2.6128651519622306, "grad_norm": 0.25347680598938593, "learning_rate": 7.174863387978143e-06, "loss": 0.4006, "num_tokens": 53612374.0, "step": 8855 }, { "epoch": 2.6143405134257893, "grad_norm": 0.27088901357767114, "learning_rate": 7.147540983606557e-06, "loss": 0.3957, "num_tokens": 54403197.0, "step": 8860 }, { "epoch": 2.615815874889348, "grad_norm": 0.2661768872298797, "learning_rate": 7.1202185792349735e-06, "loss": 0.3827, "num_tokens": 55165558.0, "step": 8865 }, { "epoch": 2.6172912363529064, "grad_norm": 0.3095349295197197, "learning_rate": 7.092896174863388e-06, "loss": 0.3873, "num_tokens": 55881214.0, "step": 8870 }, { "epoch": 2.618766597816465, "grad_norm": 0.305267130687086, "learning_rate": 7.065573770491804e-06, "loss": 0.3909, "num_tokens": 56633062.0, "step": 8875 }, { "epoch": 2.6202419592800235, "grad_norm": 0.27648093000324664, "learning_rate": 7.0382513661202185e-06, "loss": 0.3825, "num_tokens": 57395878.0, "step": 8880 }, { "epoch": 2.621717320743582, "grad_norm": 0.2680726710710752, "learning_rate": 7.010928961748635e-06, "loss": 0.4003, "num_tokens": 58188436.0, "step": 8885 }, { "epoch": 2.623192682207141, "grad_norm": 0.28270356335109764, "learning_rate": 6.983606557377049e-06, "loss": 0.4035, "num_tokens": 58996385.0, "step": 8890 }, { "epoch": 2.6246680436706993, "grad_norm": 0.26832937033247983, "learning_rate": 6.9562841530054644e-06, "loss": 0.3838, "num_tokens": 59811489.0, "step": 8895 }, { "epoch": 2.626143405134258, "grad_norm": 0.2802734583030222, "learning_rate": 6.92896174863388e-06, "loss": 0.3789, "num_tokens": 60545536.0, "step": 8900 }, { "epoch": 2.6276187665978163, "grad_norm": 0.27323653845941465, "learning_rate": 6.901639344262295e-06, "loss": 0.3899, "num_tokens": 61323518.0, "step": 8905 }, { "epoch": 2.629094128061375, "grad_norm": 0.2838719117712319, "learning_rate": 6.874316939890711e-06, "loss": 0.3957, "num_tokens": 62051821.0, "step": 8910 }, { "epoch": 2.630569489524934, "grad_norm": 0.2841900650266135, "learning_rate": 6.846994535519126e-06, "loss": 0.3909, "num_tokens": 62821687.0, "step": 8915 }, { "epoch": 2.632044850988492, "grad_norm": 0.31033871660096957, "learning_rate": 6.819672131147542e-06, "loss": 0.389, "num_tokens": 63552743.0, "step": 8920 }, { "epoch": 2.6335202124520505, "grad_norm": 0.2883661269400399, "learning_rate": 6.792349726775957e-06, "loss": 0.3695, "num_tokens": 64278879.0, "step": 8925 }, { "epoch": 2.6349955739156092, "grad_norm": 0.26558658248792427, "learning_rate": 6.765027322404371e-06, "loss": 0.3986, "num_tokens": 65046395.0, "step": 8930 }, { "epoch": 2.636470935379168, "grad_norm": 0.28594391631772037, "learning_rate": 6.737704918032788e-06, "loss": 0.3766, "num_tokens": 65757827.0, "step": 8935 }, { "epoch": 2.6379462968427267, "grad_norm": 0.29970365466857984, "learning_rate": 6.710382513661202e-06, "loss": 0.3842, "num_tokens": 66473976.0, "step": 8940 }, { "epoch": 2.639421658306285, "grad_norm": 0.28032319345669166, "learning_rate": 6.683060109289618e-06, "loss": 0.4056, "num_tokens": 67253780.0, "step": 8945 }, { "epoch": 2.6408970197698434, "grad_norm": 0.2571008845210928, "learning_rate": 6.655737704918033e-06, "loss": 0.4017, "num_tokens": 68044392.0, "step": 8950 }, { "epoch": 2.642372381233402, "grad_norm": 0.2677164585629073, "learning_rate": 6.628415300546448e-06, "loss": 0.3784, "num_tokens": 68749110.0, "step": 8955 }, { "epoch": 2.643847742696961, "grad_norm": 0.24062966516270748, "learning_rate": 6.601092896174864e-06, "loss": 0.3836, "num_tokens": 69474755.0, "step": 8960 }, { "epoch": 2.645323104160519, "grad_norm": 0.2866753550876129, "learning_rate": 6.573770491803279e-06, "loss": 0.3789, "num_tokens": 70209004.0, "step": 8965 }, { "epoch": 2.646798465624078, "grad_norm": 0.27467286283694836, "learning_rate": 6.546448087431695e-06, "loss": 0.3839, "num_tokens": 70938737.0, "step": 8970 }, { "epoch": 2.6482738270876363, "grad_norm": 0.289876267945408, "learning_rate": 6.51912568306011e-06, "loss": 0.4008, "num_tokens": 71655095.0, "step": 8975 }, { "epoch": 2.649749188551195, "grad_norm": 0.27961794714585425, "learning_rate": 6.491803278688526e-06, "loss": 0.4034, "num_tokens": 72410103.0, "step": 8980 }, { "epoch": 2.6512245500147538, "grad_norm": 0.25583793870232735, "learning_rate": 6.46448087431694e-06, "loss": 0.3957, "num_tokens": 73140240.0, "step": 8985 }, { "epoch": 2.652699911478312, "grad_norm": 0.2578519151617952, "learning_rate": 6.437158469945355e-06, "loss": 0.4062, "num_tokens": 73897274.0, "step": 8990 }, { "epoch": 2.654175272941871, "grad_norm": 0.25657191738045637, "learning_rate": 6.409836065573771e-06, "loss": 0.3984, "num_tokens": 74677157.0, "step": 8995 }, { "epoch": 2.655650634405429, "grad_norm": 0.2869143936164678, "learning_rate": 6.382513661202186e-06, "loss": 0.4014, "num_tokens": 75430322.0, "step": 9000 }, { "epoch": 2.657125995868988, "grad_norm": 0.26999879834184537, "learning_rate": 6.3551912568306016e-06, "loss": 0.387, "num_tokens": 76201148.0, "step": 9005 }, { "epoch": 2.6586013573325467, "grad_norm": 0.2659707297299653, "learning_rate": 6.3278688524590166e-06, "loss": 0.3861, "num_tokens": 76913886.0, "step": 9010 }, { "epoch": 2.660076718796105, "grad_norm": 0.27770754464016667, "learning_rate": 6.3005464480874324e-06, "loss": 0.3898, "num_tokens": 77680721.0, "step": 9015 }, { "epoch": 2.6615520802596637, "grad_norm": 0.24648627902816905, "learning_rate": 6.2732240437158475e-06, "loss": 0.4103, "num_tokens": 78463300.0, "step": 9020 }, { "epoch": 2.663027441723222, "grad_norm": 0.24403593414155605, "learning_rate": 6.2459016393442625e-06, "loss": 0.3748, "num_tokens": 79260607.0, "step": 9025 }, { "epoch": 2.664502803186781, "grad_norm": 0.23967995098682454, "learning_rate": 6.218579234972678e-06, "loss": 0.4141, "num_tokens": 80087675.0, "step": 9030 }, { "epoch": 2.6659781646503395, "grad_norm": 0.2561806210761765, "learning_rate": 6.1912568306010925e-06, "loss": 0.3986, "num_tokens": 80903058.0, "step": 9035 }, { "epoch": 2.667453526113898, "grad_norm": 0.2595225189221605, "learning_rate": 6.163934426229508e-06, "loss": 0.3898, "num_tokens": 81637947.0, "step": 9040 }, { "epoch": 2.6689288875774566, "grad_norm": 0.2622307474031872, "learning_rate": 6.136612021857923e-06, "loss": 0.3805, "num_tokens": 82341284.0, "step": 9045 }, { "epoch": 2.670404249041015, "grad_norm": 0.30206275283463857, "learning_rate": 6.109289617486339e-06, "loss": 0.3764, "num_tokens": 83044605.0, "step": 9050 }, { "epoch": 2.6718796105045737, "grad_norm": 0.3265872248263865, "learning_rate": 6.081967213114754e-06, "loss": 0.3994, "num_tokens": 83789903.0, "step": 9055 }, { "epoch": 2.6733549719681324, "grad_norm": 0.3278214539697124, "learning_rate": 6.05464480874317e-06, "loss": 0.3942, "num_tokens": 84470595.0, "step": 9060 }, { "epoch": 2.6748303334316907, "grad_norm": 0.2595172056658916, "learning_rate": 6.027322404371584e-06, "loss": 0.3966, "num_tokens": 85272781.0, "step": 9065 }, { "epoch": 2.6763056948952495, "grad_norm": 0.2476798910203527, "learning_rate": 6e-06, "loss": 0.3975, "num_tokens": 86050270.0, "step": 9070 }, { "epoch": 2.677781056358808, "grad_norm": 0.2671340698934059, "learning_rate": 5.972677595628415e-06, "loss": 0.3908, "num_tokens": 86773413.0, "step": 9075 }, { "epoch": 2.6792564178223666, "grad_norm": 0.27316049272225823, "learning_rate": 5.945355191256831e-06, "loss": 0.3857, "num_tokens": 87564606.0, "step": 9080 }, { "epoch": 2.6807317792859253, "grad_norm": 0.29251023935147275, "learning_rate": 5.918032786885246e-06, "loss": 0.3833, "num_tokens": 88292365.0, "step": 9085 }, { "epoch": 2.6822071407494836, "grad_norm": 0.3271972980135341, "learning_rate": 5.890710382513662e-06, "loss": 0.3883, "num_tokens": 89008681.0, "step": 9090 }, { "epoch": 2.683682502213042, "grad_norm": 0.30800545213164565, "learning_rate": 5.863387978142077e-06, "loss": 0.3936, "num_tokens": 89736846.0, "step": 9095 }, { "epoch": 2.6851578636766007, "grad_norm": 0.28339213621512926, "learning_rate": 5.836065573770492e-06, "loss": 0.4003, "num_tokens": 90510970.0, "step": 9100 }, { "epoch": 2.6866332251401595, "grad_norm": 0.284993447818331, "learning_rate": 5.808743169398907e-06, "loss": 0.402, "num_tokens": 91348032.0, "step": 9105 }, { "epoch": 2.6881085866037178, "grad_norm": 0.24935133823242064, "learning_rate": 5.781420765027323e-06, "loss": 0.3896, "num_tokens": 92141808.0, "step": 9110 }, { "epoch": 2.6895839480672765, "grad_norm": 0.2714461416693103, "learning_rate": 5.754098360655738e-06, "loss": 0.4019, "num_tokens": 92898212.0, "step": 9115 }, { "epoch": 2.691059309530835, "grad_norm": 0.2630182696999986, "learning_rate": 5.726775956284154e-06, "loss": 0.3939, "num_tokens": 93633590.0, "step": 9120 }, { "epoch": 2.6925346709943936, "grad_norm": 0.2662270195402974, "learning_rate": 5.699453551912569e-06, "loss": 0.4016, "num_tokens": 94455756.0, "step": 9125 }, { "epoch": 2.6940100324579523, "grad_norm": 0.2655852627889512, "learning_rate": 5.672131147540984e-06, "loss": 0.3876, "num_tokens": 95216309.0, "step": 9130 }, { "epoch": 2.6954853939215107, "grad_norm": 0.2672850188897333, "learning_rate": 5.644808743169399e-06, "loss": 0.4007, "num_tokens": 95990445.0, "step": 9135 }, { "epoch": 2.6969607553850694, "grad_norm": 0.2527119775009369, "learning_rate": 5.617486338797815e-06, "loss": 0.3777, "num_tokens": 96753081.0, "step": 9140 }, { "epoch": 2.6984361168486277, "grad_norm": 0.26326210324525345, "learning_rate": 5.59016393442623e-06, "loss": 0.3786, "num_tokens": 97477778.0, "step": 9145 }, { "epoch": 2.6999114783121865, "grad_norm": 0.28052349898578477, "learning_rate": 5.5628415300546455e-06, "loss": 0.3888, "num_tokens": 98237609.0, "step": 9150 }, { "epoch": 2.7013868397757452, "grad_norm": 0.2512436579690796, "learning_rate": 5.5355191256830605e-06, "loss": 0.402, "num_tokens": 98985497.0, "step": 9155 }, { "epoch": 2.7028622012393035, "grad_norm": 0.2632402753509915, "learning_rate": 5.508196721311476e-06, "loss": 0.3848, "num_tokens": 99719335.0, "step": 9160 }, { "epoch": 2.7043375627028623, "grad_norm": 0.25839811164675336, "learning_rate": 5.4808743169398905e-06, "loss": 0.3871, "num_tokens": 100478301.0, "step": 9165 }, { "epoch": 2.7058129241664206, "grad_norm": 0.31486895567693307, "learning_rate": 5.453551912568306e-06, "loss": 0.3978, "num_tokens": 101259689.0, "step": 9170 }, { "epoch": 2.7072882856299794, "grad_norm": 0.2811593700460673, "learning_rate": 5.426229508196721e-06, "loss": 0.3826, "num_tokens": 101991301.0, "step": 9175 }, { "epoch": 2.708763647093538, "grad_norm": 0.25985778825589234, "learning_rate": 5.398907103825137e-06, "loss": 0.391, "num_tokens": 102782131.0, "step": 9180 }, { "epoch": 2.7102390085570964, "grad_norm": 0.26872801379966016, "learning_rate": 5.371584699453552e-06, "loss": 0.3881, "num_tokens": 103579146.0, "step": 9185 }, { "epoch": 2.711714370020655, "grad_norm": 0.24718482601829025, "learning_rate": 5.344262295081968e-06, "loss": 0.3972, "num_tokens": 104388913.0, "step": 9190 }, { "epoch": 2.7131897314842135, "grad_norm": 0.26628651442015605, "learning_rate": 5.316939890710382e-06, "loss": 0.3939, "num_tokens": 105176199.0, "step": 9195 }, { "epoch": 2.7146650929477723, "grad_norm": 0.254559294270911, "learning_rate": 5.289617486338798e-06, "loss": 0.4014, "num_tokens": 105994896.0, "step": 9200 }, { "epoch": 2.716140454411331, "grad_norm": 0.2671600412278785, "learning_rate": 5.262295081967213e-06, "loss": 0.3972, "num_tokens": 106757596.0, "step": 9205 }, { "epoch": 2.7176158158748893, "grad_norm": 0.2980396662303423, "learning_rate": 5.234972677595629e-06, "loss": 0.3814, "num_tokens": 107461268.0, "step": 9210 }, { "epoch": 2.719091177338448, "grad_norm": 0.2756244728814652, "learning_rate": 5.207650273224044e-06, "loss": 0.391, "num_tokens": 108220224.0, "step": 9215 }, { "epoch": 2.7205665388020064, "grad_norm": 0.2771017664380777, "learning_rate": 5.180327868852459e-06, "loss": 0.3865, "num_tokens": 108981773.0, "step": 9220 }, { "epoch": 2.722041900265565, "grad_norm": 0.2876831945258403, "learning_rate": 5.153005464480874e-06, "loss": 0.3783, "num_tokens": 109731488.0, "step": 9225 }, { "epoch": 2.723517261729124, "grad_norm": 0.2675900036202732, "learning_rate": 5.12568306010929e-06, "loss": 0.4059, "num_tokens": 110486161.0, "step": 9230 }, { "epoch": 2.724992623192682, "grad_norm": 0.28757518975061236, "learning_rate": 5.098360655737705e-06, "loss": 0.4022, "num_tokens": 111214560.0, "step": 9235 }, { "epoch": 2.7264679846562405, "grad_norm": 0.26218834616555214, "learning_rate": 5.071038251366121e-06, "loss": 0.3958, "num_tokens": 111960079.0, "step": 9240 }, { "epoch": 2.7279433461197993, "grad_norm": 0.2907426012141948, "learning_rate": 5.043715846994536e-06, "loss": 0.3885, "num_tokens": 112694179.0, "step": 9245 }, { "epoch": 2.729418707583358, "grad_norm": 0.2640851460452626, "learning_rate": 5.016393442622951e-06, "loss": 0.4078, "num_tokens": 113473624.0, "step": 9250 }, { "epoch": 2.7308940690469163, "grad_norm": 0.33634354340406514, "learning_rate": 4.989071038251367e-06, "loss": 0.3877, "num_tokens": 114212715.0, "step": 9255 }, { "epoch": 2.732369430510475, "grad_norm": 0.274714967159584, "learning_rate": 4.961748633879782e-06, "loss": 0.3921, "num_tokens": 114986923.0, "step": 9260 }, { "epoch": 2.7338447919740334, "grad_norm": 0.2504541809316926, "learning_rate": 4.934426229508197e-06, "loss": 0.3812, "num_tokens": 115757610.0, "step": 9265 }, { "epoch": 2.735320153437592, "grad_norm": 0.28291334548906355, "learning_rate": 4.907103825136612e-06, "loss": 0.3857, "num_tokens": 116542663.0, "step": 9270 }, { "epoch": 2.736795514901151, "grad_norm": 0.2525440490263362, "learning_rate": 4.879781420765028e-06, "loss": 0.3897, "num_tokens": 117324981.0, "step": 9275 }, { "epoch": 2.7382708763647092, "grad_norm": 0.25457272231764766, "learning_rate": 4.852459016393443e-06, "loss": 0.3806, "num_tokens": 118037025.0, "step": 9280 }, { "epoch": 2.739746237828268, "grad_norm": 0.27410582126061106, "learning_rate": 4.8251366120218585e-06, "loss": 0.3824, "num_tokens": 118823968.0, "step": 9285 }, { "epoch": 2.7412215992918263, "grad_norm": 0.2764467229602498, "learning_rate": 4.7978142076502736e-06, "loss": 0.3945, "num_tokens": 119600538.0, "step": 9290 }, { "epoch": 2.742696960755385, "grad_norm": 0.2552196910530365, "learning_rate": 4.7704918032786886e-06, "loss": 0.3845, "num_tokens": 120358258.0, "step": 9295 }, { "epoch": 2.744172322218944, "grad_norm": 0.29027780725298136, "learning_rate": 4.743169398907104e-06, "loss": 0.3754, "num_tokens": 121091796.0, "step": 9300 }, { "epoch": 2.745647683682502, "grad_norm": 0.2612214661721622, "learning_rate": 4.7158469945355195e-06, "loss": 0.3741, "num_tokens": 121853222.0, "step": 9305 }, { "epoch": 2.747123045146061, "grad_norm": 0.2654797020306799, "learning_rate": 4.6885245901639345e-06, "loss": 0.3903, "num_tokens": 122600374.0, "step": 9310 }, { "epoch": 2.748598406609619, "grad_norm": 0.2888121243417279, "learning_rate": 4.66120218579235e-06, "loss": 0.4004, "num_tokens": 123339294.0, "step": 9315 }, { "epoch": 2.750073768073178, "grad_norm": 0.2806577259195079, "learning_rate": 4.633879781420765e-06, "loss": 0.3925, "num_tokens": 124101541.0, "step": 9320 }, { "epoch": 2.7515491295367367, "grad_norm": 0.27891936486997315, "learning_rate": 4.60655737704918e-06, "loss": 0.3721, "num_tokens": 124800211.0, "step": 9325 }, { "epoch": 2.753024491000295, "grad_norm": 0.2576367651564353, "learning_rate": 4.579234972677595e-06, "loss": 0.3932, "num_tokens": 125530092.0, "step": 9330 }, { "epoch": 2.7544998524638538, "grad_norm": 0.27822028634842594, "learning_rate": 4.551912568306011e-06, "loss": 0.4024, "num_tokens": 126298237.0, "step": 9335 }, { "epoch": 2.755975213927412, "grad_norm": 0.2963728805976681, "learning_rate": 4.524590163934426e-06, "loss": 0.3828, "num_tokens": 127090834.0, "step": 9340 }, { "epoch": 2.757450575390971, "grad_norm": 0.29124850146314873, "learning_rate": 4.497267759562842e-06, "loss": 0.3704, "num_tokens": 127741052.0, "step": 9345 }, { "epoch": 2.7589259368545296, "grad_norm": 0.28531767798353974, "learning_rate": 4.469945355191257e-06, "loss": 0.3966, "num_tokens": 128461053.0, "step": 9350 }, { "epoch": 2.760401298318088, "grad_norm": 0.2664423227109161, "learning_rate": 4.442622950819672e-06, "loss": 0.4031, "num_tokens": 129252823.0, "step": 9355 }, { "epoch": 2.7618766597816466, "grad_norm": 0.2765547646797154, "learning_rate": 4.415300546448087e-06, "loss": 0.3816, "num_tokens": 129979606.0, "step": 9360 }, { "epoch": 2.763352021245205, "grad_norm": 0.2937877701439732, "learning_rate": 4.387978142076503e-06, "loss": 0.3957, "num_tokens": 130719538.0, "step": 9365 }, { "epoch": 2.7648273827087637, "grad_norm": 0.25774302806241806, "learning_rate": 4.360655737704918e-06, "loss": 0.4021, "num_tokens": 131477880.0, "step": 9370 }, { "epoch": 2.7663027441723225, "grad_norm": 0.2634176133313534, "learning_rate": 4.333333333333334e-06, "loss": 0.3714, "num_tokens": 132179641.0, "step": 9375 }, { "epoch": 2.767778105635881, "grad_norm": 0.27989046154836505, "learning_rate": 4.306010928961749e-06, "loss": 0.3837, "num_tokens": 132887504.0, "step": 9380 }, { "epoch": 2.769253467099439, "grad_norm": 0.25356678367480684, "learning_rate": 4.278688524590164e-06, "loss": 0.3949, "num_tokens": 133662432.0, "step": 9385 }, { "epoch": 2.770728828562998, "grad_norm": 0.26118416109104386, "learning_rate": 4.251366120218579e-06, "loss": 0.3934, "num_tokens": 134443862.0, "step": 9390 }, { "epoch": 2.7722041900265566, "grad_norm": 0.26553187568261116, "learning_rate": 4.224043715846995e-06, "loss": 0.3839, "num_tokens": 135197096.0, "step": 9395 }, { "epoch": 2.773679551490115, "grad_norm": 0.2578220852345799, "learning_rate": 4.19672131147541e-06, "loss": 0.3947, "num_tokens": 135936632.0, "step": 9400 }, { "epoch": 2.7751549129536737, "grad_norm": 0.2787684228621493, "learning_rate": 4.169398907103826e-06, "loss": 0.3886, "num_tokens": 136692160.0, "step": 9405 }, { "epoch": 2.776630274417232, "grad_norm": 0.2803113002495992, "learning_rate": 4.142076502732241e-06, "loss": 0.3885, "num_tokens": 137429940.0, "step": 9410 }, { "epoch": 2.7781056358807907, "grad_norm": 0.27137458541347087, "learning_rate": 4.114754098360657e-06, "loss": 0.3833, "num_tokens": 138168198.0, "step": 9415 }, { "epoch": 2.7795809973443495, "grad_norm": 0.2694182013657766, "learning_rate": 4.087431693989071e-06, "loss": 0.3913, "num_tokens": 138970361.0, "step": 9420 }, { "epoch": 2.781056358807908, "grad_norm": 0.24980830664126383, "learning_rate": 4.060109289617487e-06, "loss": 0.3948, "num_tokens": 139742476.0, "step": 9425 }, { "epoch": 2.7825317202714666, "grad_norm": 0.2742538794242408, "learning_rate": 4.032786885245902e-06, "loss": 0.3884, "num_tokens": 140532197.0, "step": 9430 }, { "epoch": 2.784007081735025, "grad_norm": 0.28299064464105356, "learning_rate": 4.0054644808743175e-06, "loss": 0.3685, "num_tokens": 141268169.0, "step": 9435 }, { "epoch": 2.7854824431985836, "grad_norm": 0.24873609822531526, "learning_rate": 3.9781420765027325e-06, "loss": 0.3822, "num_tokens": 142054655.0, "step": 9440 }, { "epoch": 2.7869578046621424, "grad_norm": 0.27779701629972087, "learning_rate": 3.950819672131148e-06, "loss": 0.4024, "num_tokens": 142856627.0, "step": 9445 }, { "epoch": 2.7884331661257007, "grad_norm": 0.2730830410631186, "learning_rate": 3.9234972677595625e-06, "loss": 0.3806, "num_tokens": 143595194.0, "step": 9450 }, { "epoch": 2.7899085275892594, "grad_norm": 0.29334813599799797, "learning_rate": 3.896174863387978e-06, "loss": 0.3926, "num_tokens": 144374020.0, "step": 9455 }, { "epoch": 2.7913838890528178, "grad_norm": 0.2930580275273647, "learning_rate": 3.868852459016393e-06, "loss": 0.3771, "num_tokens": 145115699.0, "step": 9460 }, { "epoch": 2.7928592505163765, "grad_norm": 0.254921900743752, "learning_rate": 3.841530054644809e-06, "loss": 0.3905, "num_tokens": 145830595.0, "step": 9465 }, { "epoch": 2.7943346119799353, "grad_norm": 0.27418355424110685, "learning_rate": 3.8142076502732243e-06, "loss": 0.3842, "num_tokens": 146560909.0, "step": 9470 }, { "epoch": 2.7958099734434936, "grad_norm": 0.30899366278755797, "learning_rate": 3.7868852459016397e-06, "loss": 0.379, "num_tokens": 147279857.0, "step": 9475 }, { "epoch": 2.7972853349070523, "grad_norm": 0.27285368766967466, "learning_rate": 3.759562841530055e-06, "loss": 0.3702, "num_tokens": 147990538.0, "step": 9480 }, { "epoch": 2.7987606963706106, "grad_norm": 0.3305522827661096, "learning_rate": 3.7322404371584698e-06, "loss": 0.3715, "num_tokens": 148702368.0, "step": 9485 }, { "epoch": 2.8002360578341694, "grad_norm": 0.25740313656826336, "learning_rate": 3.704918032786885e-06, "loss": 0.3919, "num_tokens": 149508012.0, "step": 9490 }, { "epoch": 2.801711419297728, "grad_norm": 0.26315855389384124, "learning_rate": 3.6775956284153006e-06, "loss": 0.3739, "num_tokens": 150267002.0, "step": 9495 }, { "epoch": 2.8031867807612865, "grad_norm": 0.28052279608187597, "learning_rate": 3.650273224043716e-06, "loss": 0.3653, "num_tokens": 150944630.0, "step": 9500 }, { "epoch": 2.8046621422248452, "grad_norm": 0.2959379382581608, "learning_rate": 3.6229508196721315e-06, "loss": 0.4009, "num_tokens": 151754096.0, "step": 9505 }, { "epoch": 2.8061375036884035, "grad_norm": 0.27264770965555085, "learning_rate": 3.595628415300547e-06, "loss": 0.3628, "num_tokens": 152445631.0, "step": 9510 }, { "epoch": 2.8076128651519623, "grad_norm": 0.27036862398884187, "learning_rate": 3.5683060109289616e-06, "loss": 0.3873, "num_tokens": 153231092.0, "step": 9515 }, { "epoch": 2.809088226615521, "grad_norm": 0.2689934144800254, "learning_rate": 3.540983606557377e-06, "loss": 0.3822, "num_tokens": 153967724.0, "step": 9520 }, { "epoch": 2.8105635880790794, "grad_norm": 0.2509343587095815, "learning_rate": 3.5136612021857924e-06, "loss": 0.3954, "num_tokens": 154819057.0, "step": 9525 }, { "epoch": 2.812038949542638, "grad_norm": 0.2608155084988626, "learning_rate": 3.486338797814208e-06, "loss": 0.3984, "num_tokens": 155559134.0, "step": 9530 }, { "epoch": 2.8135143110061964, "grad_norm": 0.29885873617741154, "learning_rate": 3.4590163934426233e-06, "loss": 0.3804, "num_tokens": 156276224.0, "step": 9535 }, { "epoch": 2.814989672469755, "grad_norm": 0.2766415029580258, "learning_rate": 3.4316939890710388e-06, "loss": 0.3871, "num_tokens": 157020096.0, "step": 9540 }, { "epoch": 2.816465033933314, "grad_norm": 0.278759112765328, "learning_rate": 3.4043715846994534e-06, "loss": 0.3863, "num_tokens": 157772244.0, "step": 9545 }, { "epoch": 2.8179403953968722, "grad_norm": 0.2598638923813204, "learning_rate": 3.377049180327869e-06, "loss": 0.4035, "num_tokens": 158597109.0, "step": 9550 }, { "epoch": 2.8194157568604306, "grad_norm": 0.2669369404275867, "learning_rate": 3.3497267759562842e-06, "loss": 0.3915, "num_tokens": 159369044.0, "step": 9555 }, { "epoch": 2.8208911183239893, "grad_norm": 0.30485776533095227, "learning_rate": 3.3224043715846997e-06, "loss": 0.3862, "num_tokens": 160176482.0, "step": 9560 }, { "epoch": 2.822366479787548, "grad_norm": 0.2739864008335274, "learning_rate": 3.295081967213115e-06, "loss": 0.3696, "num_tokens": 160944944.0, "step": 9565 }, { "epoch": 2.8238418412511064, "grad_norm": 0.26828814562175435, "learning_rate": 3.2677595628415305e-06, "loss": 0.4018, "num_tokens": 161742784.0, "step": 9570 }, { "epoch": 2.825317202714665, "grad_norm": 0.2923966134562494, "learning_rate": 3.240437158469946e-06, "loss": 0.3876, "num_tokens": 162487764.0, "step": 9575 }, { "epoch": 2.8267925641782234, "grad_norm": 0.2662651988875061, "learning_rate": 3.2131147540983606e-06, "loss": 0.3869, "num_tokens": 163243464.0, "step": 9580 }, { "epoch": 2.828267925641782, "grad_norm": 0.26460158255994226, "learning_rate": 3.185792349726776e-06, "loss": 0.3912, "num_tokens": 164033248.0, "step": 9585 }, { "epoch": 2.829743287105341, "grad_norm": 0.26002208814349226, "learning_rate": 3.1584699453551915e-06, "loss": 0.3582, "num_tokens": 164745020.0, "step": 9590 }, { "epoch": 2.8312186485688993, "grad_norm": 0.25874540832155474, "learning_rate": 3.131147540983607e-06, "loss": 0.3934, "num_tokens": 165538194.0, "step": 9595 }, { "epoch": 2.832694010032458, "grad_norm": 0.2899317512479807, "learning_rate": 3.1038251366120223e-06, "loss": 0.3966, "num_tokens": 166273301.0, "step": 9600 }, { "epoch": 2.8341693714960163, "grad_norm": 0.26423565721414927, "learning_rate": 3.0765027322404374e-06, "loss": 0.3978, "num_tokens": 167058800.0, "step": 9605 }, { "epoch": 2.835644732959575, "grad_norm": 0.27013118400919944, "learning_rate": 3.049180327868853e-06, "loss": 0.3668, "num_tokens": 167750146.0, "step": 9610 }, { "epoch": 2.837120094423134, "grad_norm": 0.28787475975945376, "learning_rate": 3.0218579234972682e-06, "loss": 0.3793, "num_tokens": 168465045.0, "step": 9615 }, { "epoch": 2.838595455886692, "grad_norm": 0.2638101960986424, "learning_rate": 2.9945355191256832e-06, "loss": 0.3924, "num_tokens": 169227167.0, "step": 9620 }, { "epoch": 2.840070817350251, "grad_norm": 0.2862038491683355, "learning_rate": 2.9672131147540987e-06, "loss": 0.3867, "num_tokens": 169953500.0, "step": 9625 }, { "epoch": 2.841546178813809, "grad_norm": 0.2538628617181889, "learning_rate": 2.939890710382514e-06, "loss": 0.3997, "num_tokens": 170725608.0, "step": 9630 }, { "epoch": 2.843021540277368, "grad_norm": 0.26240936454610364, "learning_rate": 2.912568306010929e-06, "loss": 0.3913, "num_tokens": 171481569.0, "step": 9635 }, { "epoch": 2.8444969017409267, "grad_norm": 0.26073005602831417, "learning_rate": 2.8852459016393446e-06, "loss": 0.3877, "num_tokens": 172237090.0, "step": 9640 }, { "epoch": 2.845972263204485, "grad_norm": 0.2524973547687659, "learning_rate": 2.8579234972677596e-06, "loss": 0.3911, "num_tokens": 172988876.0, "step": 9645 }, { "epoch": 2.847447624668044, "grad_norm": 0.2889109265549457, "learning_rate": 2.830601092896175e-06, "loss": 0.3769, "num_tokens": 173681232.0, "step": 9650 }, { "epoch": 2.848922986131602, "grad_norm": 0.29586098567147734, "learning_rate": 2.8032786885245905e-06, "loss": 0.3835, "num_tokens": 174429107.0, "step": 9655 }, { "epoch": 2.850398347595161, "grad_norm": 0.29347387446503775, "learning_rate": 2.7759562841530055e-06, "loss": 0.3911, "num_tokens": 175223887.0, "step": 9660 }, { "epoch": 2.8518737090587196, "grad_norm": 0.2665027024860678, "learning_rate": 2.748633879781421e-06, "loss": 0.3837, "num_tokens": 175982141.0, "step": 9665 }, { "epoch": 2.853349070522278, "grad_norm": 0.2729055372437067, "learning_rate": 2.721311475409836e-06, "loss": 0.3931, "num_tokens": 176765665.0, "step": 9670 }, { "epoch": 2.8548244319858367, "grad_norm": 0.24327408953350518, "learning_rate": 2.6939890710382514e-06, "loss": 0.3817, "num_tokens": 177538981.0, "step": 9675 }, { "epoch": 2.856299793449395, "grad_norm": 0.29393551247054817, "learning_rate": 2.666666666666667e-06, "loss": 0.3934, "num_tokens": 178260987.0, "step": 9680 }, { "epoch": 2.8577751549129538, "grad_norm": 0.2655616238244392, "learning_rate": 2.639344262295082e-06, "loss": 0.3962, "num_tokens": 179038448.0, "step": 9685 }, { "epoch": 2.8592505163765125, "grad_norm": 0.2967323223945957, "learning_rate": 2.6120218579234973e-06, "loss": 0.377, "num_tokens": 179765806.0, "step": 9690 }, { "epoch": 2.860725877840071, "grad_norm": 0.33045469778208847, "learning_rate": 2.5846994535519127e-06, "loss": 0.3822, "num_tokens": 180501798.0, "step": 9695 }, { "epoch": 2.862201239303629, "grad_norm": 0.27316443973035015, "learning_rate": 2.5573770491803277e-06, "loss": 0.3708, "num_tokens": 181204867.0, "step": 9700 }, { "epoch": 2.863676600767188, "grad_norm": 0.2551604643802966, "learning_rate": 2.530054644808743e-06, "loss": 0.3909, "num_tokens": 182025099.0, "step": 9705 }, { "epoch": 2.8651519622307466, "grad_norm": 0.2470411485538796, "learning_rate": 2.5027322404371586e-06, "loss": 0.3813, "num_tokens": 182772559.0, "step": 9710 }, { "epoch": 2.866627323694305, "grad_norm": 0.27053568542752465, "learning_rate": 2.4754098360655736e-06, "loss": 0.3977, "num_tokens": 183537140.0, "step": 9715 }, { "epoch": 2.8681026851578637, "grad_norm": 0.26712669912582415, "learning_rate": 2.448087431693989e-06, "loss": 0.3891, "num_tokens": 184336228.0, "step": 9720 }, { "epoch": 2.869578046621422, "grad_norm": 0.24860452018544418, "learning_rate": 2.4207650273224045e-06, "loss": 0.3691, "num_tokens": 185065056.0, "step": 9725 }, { "epoch": 2.8710534080849808, "grad_norm": 0.2696363177867628, "learning_rate": 2.3934426229508195e-06, "loss": 0.3848, "num_tokens": 185820795.0, "step": 9730 }, { "epoch": 2.8725287695485395, "grad_norm": 0.2574531920397094, "learning_rate": 2.366120218579235e-06, "loss": 0.3765, "num_tokens": 186553568.0, "step": 9735 }, { "epoch": 2.874004131012098, "grad_norm": 0.2590419942507942, "learning_rate": 2.3387978142076504e-06, "loss": 0.3839, "num_tokens": 187341536.0, "step": 9740 }, { "epoch": 2.8754794924756566, "grad_norm": 0.2569019047454401, "learning_rate": 2.3114754098360654e-06, "loss": 0.371, "num_tokens": 188083068.0, "step": 9745 }, { "epoch": 2.876954853939215, "grad_norm": 0.25607328063241547, "learning_rate": 2.284153005464481e-06, "loss": 0.3743, "num_tokens": 188805065.0, "step": 9750 }, { "epoch": 2.8784302154027737, "grad_norm": 0.28881847720440373, "learning_rate": 2.2568306010928963e-06, "loss": 0.3996, "num_tokens": 189590342.0, "step": 9755 }, { "epoch": 2.8799055768663324, "grad_norm": 0.27730379248434167, "learning_rate": 2.2295081967213117e-06, "loss": 0.4185, "num_tokens": 190406293.0, "step": 9760 }, { "epoch": 2.8813809383298907, "grad_norm": 0.256600568302344, "learning_rate": 2.2021857923497268e-06, "loss": 0.3972, "num_tokens": 191118130.0, "step": 9765 }, { "epoch": 2.8828562997934495, "grad_norm": 0.26875972508298845, "learning_rate": 2.174863387978142e-06, "loss": 0.3769, "num_tokens": 191833030.0, "step": 9770 }, { "epoch": 2.884331661257008, "grad_norm": 0.2645434296784967, "learning_rate": 2.1475409836065576e-06, "loss": 0.3925, "num_tokens": 192638445.0, "step": 9775 }, { "epoch": 2.8858070227205666, "grad_norm": 0.27131763530724334, "learning_rate": 2.1202185792349727e-06, "loss": 0.392, "num_tokens": 193422212.0, "step": 9780 }, { "epoch": 2.8872823841841253, "grad_norm": 0.26030660188614857, "learning_rate": 2.092896174863388e-06, "loss": 0.3842, "num_tokens": 194133854.0, "step": 9785 }, { "epoch": 2.8887577456476836, "grad_norm": 0.25226715169576436, "learning_rate": 2.0655737704918035e-06, "loss": 0.3723, "num_tokens": 194875367.0, "step": 9790 }, { "epoch": 2.8902331071112424, "grad_norm": 0.2708551097247364, "learning_rate": 2.0382513661202185e-06, "loss": 0.3765, "num_tokens": 195647295.0, "step": 9795 }, { "epoch": 2.8917084685748007, "grad_norm": 0.24767919254486231, "learning_rate": 2.010928961748634e-06, "loss": 0.4089, "num_tokens": 196451877.0, "step": 9800 }, { "epoch": 2.8931838300383594, "grad_norm": 0.27513314929535415, "learning_rate": 1.9836065573770494e-06, "loss": 0.3817, "num_tokens": 197175790.0, "step": 9805 }, { "epoch": 2.894659191501918, "grad_norm": 0.27322737012912823, "learning_rate": 1.9562841530054644e-06, "loss": 0.3983, "num_tokens": 197955103.0, "step": 9810 }, { "epoch": 2.8961345529654765, "grad_norm": 0.2793058509337579, "learning_rate": 1.92896174863388e-06, "loss": 0.3874, "num_tokens": 198643145.0, "step": 9815 }, { "epoch": 2.8976099144290353, "grad_norm": 0.2734643041880439, "learning_rate": 1.9016393442622953e-06, "loss": 0.3726, "num_tokens": 199350322.0, "step": 9820 }, { "epoch": 2.8990852758925936, "grad_norm": 0.24921565168067, "learning_rate": 1.8743169398907103e-06, "loss": 0.379, "num_tokens": 200117003.0, "step": 9825 }, { "epoch": 2.9005606373561523, "grad_norm": 0.2758714739581977, "learning_rate": 1.8469945355191258e-06, "loss": 0.391, "num_tokens": 200883824.0, "step": 9830 }, { "epoch": 2.902035998819711, "grad_norm": 0.284240112400362, "learning_rate": 1.8196721311475412e-06, "loss": 0.383, "num_tokens": 201601146.0, "step": 9835 }, { "epoch": 2.9035113602832694, "grad_norm": 0.26865482745208846, "learning_rate": 1.7923497267759564e-06, "loss": 0.3662, "num_tokens": 202339189.0, "step": 9840 }, { "epoch": 2.9049867217468277, "grad_norm": 0.29174453802153016, "learning_rate": 1.7650273224043717e-06, "loss": 0.3715, "num_tokens": 203088234.0, "step": 9845 }, { "epoch": 2.9064620832103865, "grad_norm": 0.26714103698889613, "learning_rate": 1.7377049180327871e-06, "loss": 0.3746, "num_tokens": 203838852.0, "step": 9850 }, { "epoch": 2.907937444673945, "grad_norm": 0.2625274526113492, "learning_rate": 1.7103825136612023e-06, "loss": 0.4065, "num_tokens": 204626822.0, "step": 9855 }, { "epoch": 2.9094128061375035, "grad_norm": 0.2775829130833783, "learning_rate": 1.6830601092896176e-06, "loss": 0.3797, "num_tokens": 205369282.0, "step": 9860 }, { "epoch": 2.9108881676010623, "grad_norm": 0.28625079332733094, "learning_rate": 1.6557377049180328e-06, "loss": 0.398, "num_tokens": 206141109.0, "step": 9865 }, { "epoch": 2.9123635290646206, "grad_norm": 0.24848179220367686, "learning_rate": 1.6284153005464482e-06, "loss": 0.3952, "num_tokens": 206958725.0, "step": 9870 }, { "epoch": 2.9138388905281793, "grad_norm": 0.2558494124878763, "learning_rate": 1.6010928961748633e-06, "loss": 0.3969, "num_tokens": 207734052.0, "step": 9875 }, { "epoch": 2.915314251991738, "grad_norm": 0.26491945343944, "learning_rate": 1.5737704918032787e-06, "loss": 0.3777, "num_tokens": 208471310.0, "step": 9880 }, { "epoch": 2.9167896134552964, "grad_norm": 0.2660485515461038, "learning_rate": 1.546448087431694e-06, "loss": 0.3833, "num_tokens": 209239034.0, "step": 9885 }, { "epoch": 2.918264974918855, "grad_norm": 0.28426916806372754, "learning_rate": 1.5191256830601094e-06, "loss": 0.4025, "num_tokens": 209957046.0, "step": 9890 }, { "epoch": 2.9197403363824135, "grad_norm": 0.26020935204506485, "learning_rate": 1.4918032786885246e-06, "loss": 0.3789, "num_tokens": 210695225.0, "step": 9895 }, { "epoch": 2.9212156978459722, "grad_norm": 0.26532113513232597, "learning_rate": 1.46448087431694e-06, "loss": 0.3755, "num_tokens": 211371921.0, "step": 9900 }, { "epoch": 2.922691059309531, "grad_norm": 0.27362962921270745, "learning_rate": 1.4371584699453553e-06, "loss": 0.3939, "num_tokens": 212190480.0, "step": 9905 }, { "epoch": 2.9241664207730893, "grad_norm": 0.29400054127474007, "learning_rate": 1.4098360655737705e-06, "loss": 0.3698, "num_tokens": 212900362.0, "step": 9910 }, { "epoch": 2.925641782236648, "grad_norm": 0.25955140203719385, "learning_rate": 1.382513661202186e-06, "loss": 0.4053, "num_tokens": 213710436.0, "step": 9915 }, { "epoch": 2.9271171437002064, "grad_norm": 0.2558207566213842, "learning_rate": 1.3551912568306011e-06, "loss": 0.4009, "num_tokens": 214541409.0, "step": 9920 }, { "epoch": 2.928592505163765, "grad_norm": 0.2484310257356401, "learning_rate": 1.3278688524590164e-06, "loss": 0.4127, "num_tokens": 215366481.0, "step": 9925 }, { "epoch": 2.930067866627324, "grad_norm": 0.2497222644017609, "learning_rate": 1.3005464480874318e-06, "loss": 0.3818, "num_tokens": 216072504.0, "step": 9930 }, { "epoch": 2.931543228090882, "grad_norm": 0.2802055956611306, "learning_rate": 1.273224043715847e-06, "loss": 0.3915, "num_tokens": 216773108.0, "step": 9935 }, { "epoch": 2.933018589554441, "grad_norm": 0.260591475464511, "learning_rate": 1.2459016393442625e-06, "loss": 0.3952, "num_tokens": 217541895.0, "step": 9940 }, { "epoch": 2.9344939510179993, "grad_norm": 0.25348637985695627, "learning_rate": 1.2185792349726777e-06, "loss": 0.3967, "num_tokens": 218343409.0, "step": 9945 }, { "epoch": 2.935969312481558, "grad_norm": 0.25794640678913766, "learning_rate": 1.191256830601093e-06, "loss": 0.3937, "num_tokens": 219113966.0, "step": 9950 }, { "epoch": 2.9374446739451168, "grad_norm": 0.26310773948362065, "learning_rate": 1.1639344262295084e-06, "loss": 0.3757, "num_tokens": 219840052.0, "step": 9955 }, { "epoch": 2.938920035408675, "grad_norm": 0.2849303878707802, "learning_rate": 1.1366120218579236e-06, "loss": 0.3794, "num_tokens": 220543013.0, "step": 9960 }, { "epoch": 2.940395396872234, "grad_norm": 0.2934512599948354, "learning_rate": 1.1092896174863388e-06, "loss": 0.3933, "num_tokens": 221329047.0, "step": 9965 }, { "epoch": 2.941870758335792, "grad_norm": 0.24806009752221952, "learning_rate": 1.0819672131147543e-06, "loss": 0.3791, "num_tokens": 222116331.0, "step": 9970 }, { "epoch": 2.943346119799351, "grad_norm": 0.27319569069415556, "learning_rate": 1.0546448087431695e-06, "loss": 0.3787, "num_tokens": 222857289.0, "step": 9975 }, { "epoch": 2.9448214812629097, "grad_norm": 0.2697229208782104, "learning_rate": 1.027322404371585e-06, "loss": 0.3866, "num_tokens": 223574207.0, "step": 9980 }, { "epoch": 2.946296842726468, "grad_norm": 0.23889808643503394, "learning_rate": 1.0000000000000002e-06, "loss": 0.3701, "num_tokens": 224330379.0, "step": 9985 }, { "epoch": 2.9477722041900263, "grad_norm": 0.26299037418678417, "learning_rate": 9.726775956284154e-07, "loss": 0.4022, "num_tokens": 225126387.0, "step": 9990 }, { "epoch": 2.949247565653585, "grad_norm": 0.27085125548981703, "learning_rate": 9.453551912568307e-07, "loss": 0.3895, "num_tokens": 225940829.0, "step": 9995 }, { "epoch": 2.950722927117144, "grad_norm": 0.2527151405423971, "learning_rate": 9.18032786885246e-07, "loss": 0.397, "num_tokens": 226680088.0, "step": 10000 }, { "epoch": 2.9521982885807025, "grad_norm": 0.275870634506539, "learning_rate": 8.907103825136612e-07, "loss": 0.3748, "num_tokens": 227412077.0, "step": 10005 }, { "epoch": 2.953673650044261, "grad_norm": 0.29270772704808923, "learning_rate": 8.633879781420766e-07, "loss": 0.3794, "num_tokens": 228163867.0, "step": 10010 }, { "epoch": 2.955149011507819, "grad_norm": 0.26204775224935106, "learning_rate": 8.360655737704919e-07, "loss": 0.3978, "num_tokens": 228936423.0, "step": 10015 }, { "epoch": 2.956624372971378, "grad_norm": 0.27330114088234175, "learning_rate": 8.087431693989072e-07, "loss": 0.3848, "num_tokens": 229704950.0, "step": 10020 }, { "epoch": 2.9580997344349367, "grad_norm": 0.3040860539010734, "learning_rate": 7.814207650273224e-07, "loss": 0.4044, "num_tokens": 230436671.0, "step": 10025 }, { "epoch": 2.959575095898495, "grad_norm": 0.2833944956293843, "learning_rate": 7.540983606557377e-07, "loss": 0.3899, "num_tokens": 231214027.0, "step": 10030 }, { "epoch": 2.9610504573620537, "grad_norm": 0.2702041632755358, "learning_rate": 7.26775956284153e-07, "loss": 0.3864, "num_tokens": 231932754.0, "step": 10035 }, { "epoch": 2.962525818825612, "grad_norm": 0.2528203134184064, "learning_rate": 6.994535519125683e-07, "loss": 0.3988, "num_tokens": 232686632.0, "step": 10040 }, { "epoch": 2.964001180289171, "grad_norm": 0.2674425794479048, "learning_rate": 6.721311475409836e-07, "loss": 0.4055, "num_tokens": 233430664.0, "step": 10045 }, { "epoch": 2.9654765417527296, "grad_norm": 0.2730151357056106, "learning_rate": 6.44808743169399e-07, "loss": 0.3942, "num_tokens": 234177837.0, "step": 10050 }, { "epoch": 2.966951903216288, "grad_norm": 0.26963096671083614, "learning_rate": 6.174863387978142e-07, "loss": 0.3861, "num_tokens": 234994330.0, "step": 10055 }, { "epoch": 2.9684272646798466, "grad_norm": 0.27361028294325973, "learning_rate": 5.901639344262295e-07, "loss": 0.3774, "num_tokens": 235673539.0, "step": 10060 }, { "epoch": 2.969902626143405, "grad_norm": 0.25075948734220405, "learning_rate": 5.628415300546449e-07, "loss": 0.381, "num_tokens": 236419580.0, "step": 10065 }, { "epoch": 2.9713779876069637, "grad_norm": 0.26629335033501655, "learning_rate": 5.355191256830602e-07, "loss": 0.3863, "num_tokens": 237191549.0, "step": 10070 }, { "epoch": 2.9728533490705225, "grad_norm": 0.2632384720929498, "learning_rate": 5.081967213114754e-07, "loss": 0.3856, "num_tokens": 237977426.0, "step": 10075 }, { "epoch": 2.9743287105340808, "grad_norm": 0.2842058126692916, "learning_rate": 4.808743169398908e-07, "loss": 0.3936, "num_tokens": 238695183.0, "step": 10080 }, { "epoch": 2.9758040719976395, "grad_norm": 0.24435978447107742, "learning_rate": 4.5355191256830604e-07, "loss": 0.4065, "num_tokens": 239493674.0, "step": 10085 }, { "epoch": 2.977279433461198, "grad_norm": 0.28447763490946265, "learning_rate": 4.262295081967214e-07, "loss": 0.3903, "num_tokens": 240257664.0, "step": 10090 }, { "epoch": 2.9787547949247566, "grad_norm": 0.2430446059317, "learning_rate": 3.989071038251366e-07, "loss": 0.3826, "num_tokens": 241017975.0, "step": 10095 }, { "epoch": 2.9802301563883153, "grad_norm": 0.30514580693586446, "learning_rate": 3.7158469945355194e-07, "loss": 0.4149, "num_tokens": 241826782.0, "step": 10100 }, { "epoch": 2.9817055178518737, "grad_norm": 0.24646145901557787, "learning_rate": 3.4426229508196727e-07, "loss": 0.3736, "num_tokens": 242587720.0, "step": 10105 }, { "epoch": 2.9831808793154324, "grad_norm": 0.27135200210206606, "learning_rate": 3.1693989071038255e-07, "loss": 0.402, "num_tokens": 243380297.0, "step": 10110 }, { "epoch": 2.9846562407789907, "grad_norm": 0.26140505931930397, "learning_rate": 2.8961748633879783e-07, "loss": 0.3892, "num_tokens": 244167650.0, "step": 10115 }, { "epoch": 2.9861316022425495, "grad_norm": 0.2681057034479873, "learning_rate": 2.622950819672131e-07, "loss": 0.3921, "num_tokens": 244913971.0, "step": 10120 }, { "epoch": 2.9876069637061082, "grad_norm": 0.26922468991120996, "learning_rate": 2.3497267759562845e-07, "loss": 0.3939, "num_tokens": 245678589.0, "step": 10125 }, { "epoch": 2.9890823251696665, "grad_norm": 0.2782643540607677, "learning_rate": 2.0765027322404373e-07, "loss": 0.4093, "num_tokens": 246495932.0, "step": 10130 }, { "epoch": 2.9905576866332253, "grad_norm": 0.28672527807996134, "learning_rate": 1.8032786885245904e-07, "loss": 0.3754, "num_tokens": 247185402.0, "step": 10135 }, { "epoch": 2.9920330480967836, "grad_norm": 0.2668283064788114, "learning_rate": 1.5300546448087432e-07, "loss": 0.3727, "num_tokens": 247911736.0, "step": 10140 }, { "epoch": 2.9935084095603424, "grad_norm": 0.275957337093494, "learning_rate": 1.2568306010928962e-07, "loss": 0.3936, "num_tokens": 248664679.0, "step": 10145 }, { "epoch": 2.994983771023901, "grad_norm": 0.2462387927697041, "learning_rate": 9.836065573770492e-08, "loss": 0.3886, "num_tokens": 249464123.0, "step": 10150 }, { "epoch": 2.9964591324874594, "grad_norm": 0.2660884665924791, "learning_rate": 7.103825136612023e-08, "loss": 0.4034, "num_tokens": 250277236.0, "step": 10155 }, { "epoch": 2.9979344939510177, "grad_norm": 0.2674269356857843, "learning_rate": 4.371584699453552e-08, "loss": 0.3903, "num_tokens": 251045265.0, "step": 10160 }, { "epoch": 2.9994098554145765, "grad_norm": 0.2711755703596401, "learning_rate": 1.639344262295082e-08, "loss": 0.406, "num_tokens": 251852396.0, "step": 10165 }, { "epoch": 3.0, "num_tokens": 252168847.0, "step": 10167, "total_flos": 3.0485438709270315e+19, "train_loss": 0.06383175736532645, "train_runtime": 10917.8223, "train_samples_per_second": 11.174, "train_steps_per_second": 0.931 } ], "logging_steps": 5, "max_steps": 10167, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0485438709270315e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }