diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4731 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2930, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017064846416382253, + "grad_norm": 2.823172429776875, + "learning_rate": 1.360544217687075e-06, + "loss": 0.8715, + "num_tokens": 949756.0, + "step": 5 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": 1.9356174655366971, + "learning_rate": 3.0612244897959185e-06, + "loss": 0.8609, + "num_tokens": 1934239.0, + "step": 10 + }, + { + "epoch": 0.005119453924914676, + "grad_norm": 1.460519264059952, + "learning_rate": 4.7619047619047615e-06, + "loss": 0.794, + "num_tokens": 2810536.0, + "step": 15 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 0.904578522693587, + "learning_rate": 6.462585034013606e-06, + "loss": 0.7436, + "num_tokens": 3759778.0, + "step": 20 + }, + { + "epoch": 0.008532423208191127, + "grad_norm": 0.7983479886595664, + "learning_rate": 8.163265306122448e-06, + "loss": 0.7124, + "num_tokens": 4719221.0, + "step": 25 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 0.7054491846301127, + "learning_rate": 9.863945578231292e-06, + "loss": 0.7244, + "num_tokens": 5645472.0, + "step": 30 + }, + { + "epoch": 0.011945392491467578, + "grad_norm": 0.5846860059632117, + "learning_rate": 1.1564625850340138e-05, + "loss": 0.6856, + "num_tokens": 6675650.0, + "step": 35 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 0.5252998046067419, + "learning_rate": 1.3265306122448982e-05, + "loss": 0.6694, + "num_tokens": 7670069.0, + "step": 40 + }, + { + "epoch": 0.015358361774744027, + "grad_norm": 0.5533849135779766, + "learning_rate": 1.4965986394557824e-05, + "loss": 0.6436, + "num_tokens": 8562223.0, + "step": 45 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 0.5631922874607858, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.6546, + "num_tokens": 9514590.0, + "step": 50 + }, + { + "epoch": 0.01877133105802048, + "grad_norm": 0.6078968314204046, + "learning_rate": 1.836734693877551e-05, + "loss": 0.6175, + "num_tokens": 10423653.0, + "step": 55 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 0.5693094869748001, + "learning_rate": 2.0068027210884355e-05, + "loss": 0.6177, + "num_tokens": 11370767.0, + "step": 60 + }, + { + "epoch": 0.02218430034129693, + "grad_norm": 0.6043968806478303, + "learning_rate": 2.17687074829932e-05, + "loss": 0.6241, + "num_tokens": 12205003.0, + "step": 65 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 0.6098681736085941, + "learning_rate": 2.3469387755102043e-05, + "loss": 0.6197, + "num_tokens": 13221634.0, + "step": 70 + }, + { + "epoch": 0.025597269624573378, + "grad_norm": 0.5652622276816748, + "learning_rate": 2.5170068027210887e-05, + "loss": 0.6032, + "num_tokens": 14132790.0, + "step": 75 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 0.5925207051821937, + "learning_rate": 2.687074829931973e-05, + "loss": 0.6095, + "num_tokens": 15120586.0, + "step": 80 + }, + { + "epoch": 0.02901023890784983, + "grad_norm": 0.6004251741348174, + "learning_rate": 2.857142857142857e-05, + "loss": 0.627, + "num_tokens": 16123604.0, + "step": 85 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 0.6116000919042185, + "learning_rate": 3.0272108843537418e-05, + "loss": 0.6003, + "num_tokens": 17045024.0, + "step": 90 + }, + { + "epoch": 0.032423208191126277, + "grad_norm": 0.6033555839102962, + "learning_rate": 3.1972789115646265e-05, + "loss": 0.6099, + "num_tokens": 17979516.0, + "step": 95 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 0.6174109618196879, + "learning_rate": 3.36734693877551e-05, + "loss": 0.5928, + "num_tokens": 18909989.0, + "step": 100 + }, + { + "epoch": 0.03583617747440273, + "grad_norm": 0.6562250628577161, + "learning_rate": 3.5374149659863946e-05, + "loss": 0.6091, + "num_tokens": 19893037.0, + "step": 105 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 0.6359940327448688, + "learning_rate": 3.707482993197279e-05, + "loss": 0.5757, + "num_tokens": 20794568.0, + "step": 110 + }, + { + "epoch": 0.03924914675767918, + "grad_norm": 0.591968085837116, + "learning_rate": 3.8775510204081634e-05, + "loss": 0.601, + "num_tokens": 21708539.0, + "step": 115 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 0.6284093561745501, + "learning_rate": 4.047619047619048e-05, + "loss": 0.6047, + "num_tokens": 22645536.0, + "step": 120 + }, + { + "epoch": 0.042662116040955635, + "grad_norm": 0.622896210775772, + "learning_rate": 4.217687074829932e-05, + "loss": 0.5941, + "num_tokens": 23599448.0, + "step": 125 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 0.5576038080368834, + "learning_rate": 4.387755102040816e-05, + "loss": 0.6178, + "num_tokens": 24609009.0, + "step": 130 + }, + { + "epoch": 0.04607508532423208, + "grad_norm": 0.5507377217724624, + "learning_rate": 4.557823129251701e-05, + "loss": 0.6048, + "num_tokens": 25618632.0, + "step": 135 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 0.6383078614695583, + "learning_rate": 4.7278911564625856e-05, + "loss": 0.5975, + "num_tokens": 26641035.0, + "step": 140 + }, + { + "epoch": 0.04948805460750853, + "grad_norm": 0.6846737722897228, + "learning_rate": 4.89795918367347e-05, + "loss": 0.5744, + "num_tokens": 27517779.0, + "step": 145 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 0.7663095587321149, + "learning_rate": 4.999994265630655e-05, + "loss": 0.5675, + "num_tokens": 28445983.0, + "step": 150 + }, + { + "epoch": 0.052901023890784986, + "grad_norm": 0.6816635800011499, + "learning_rate": 4.999929754311198e-05, + "loss": 0.5903, + "num_tokens": 29489865.0, + "step": 155 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 0.6680964778036098, + "learning_rate": 4.999793565772626e-05, + "loss": 0.5989, + "num_tokens": 30423007.0, + "step": 160 + }, + { + "epoch": 0.05631399317406143, + "grad_norm": 0.5649434202104989, + "learning_rate": 4.999585704353568e-05, + "loss": 0.5801, + "num_tokens": 31372257.0, + "step": 165 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 0.5122745336387098, + "learning_rate": 4.999306176675979e-05, + "loss": 0.5998, + "num_tokens": 32356676.0, + "step": 170 + }, + { + "epoch": 0.059726962457337884, + "grad_norm": 0.5332799116717496, + "learning_rate": 4.998954991644921e-05, + "loss": 0.5904, + "num_tokens": 33261796.0, + "step": 175 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 0.5068363705703531, + "learning_rate": 4.9985321604482835e-05, + "loss": 0.59, + "num_tokens": 34237001.0, + "step": 180 + }, + { + "epoch": 0.06313993174061433, + "grad_norm": 0.6492409848966394, + "learning_rate": 4.9980376965564286e-05, + "loss": 0.5955, + "num_tokens": 35167253.0, + "step": 185 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 0.5820124301992429, + "learning_rate": 4.997471615721756e-05, + "loss": 0.5767, + "num_tokens": 36074352.0, + "step": 190 + }, + { + "epoch": 0.06655290102389079, + "grad_norm": 0.5394095878459838, + "learning_rate": 4.996833935978207e-05, + "loss": 0.624, + "num_tokens": 37055456.0, + "step": 195 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 0.5362539000244216, + "learning_rate": 4.996124677640687e-05, + "loss": 0.5722, + "num_tokens": 37967720.0, + "step": 200 + }, + { + "epoch": 0.06996587030716724, + "grad_norm": 0.49110498514505857, + "learning_rate": 4.99534386330442e-05, + "loss": 0.5993, + "num_tokens": 38924279.0, + "step": 205 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 0.546699809294802, + "learning_rate": 4.994491517844227e-05, + "loss": 0.5938, + "num_tokens": 39853351.0, + "step": 210 + }, + { + "epoch": 0.07337883959044368, + "grad_norm": 0.5776918031520784, + "learning_rate": 4.993567668413733e-05, + "loss": 0.5809, + "num_tokens": 40811069.0, + "step": 215 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 0.4951200074682594, + "learning_rate": 4.992572344444507e-05, + "loss": 0.6027, + "num_tokens": 41833783.0, + "step": 220 + }, + { + "epoch": 0.07679180887372014, + "grad_norm": 0.5901419308378143, + "learning_rate": 4.991505577645118e-05, + "loss": 0.5747, + "num_tokens": 42744744.0, + "step": 225 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 0.5304430248970768, + "learning_rate": 4.9903674020001284e-05, + "loss": 0.6, + "num_tokens": 43682290.0, + "step": 230 + }, + { + "epoch": 0.08020477815699659, + "grad_norm": 0.5211597930937201, + "learning_rate": 4.989157853769009e-05, + "loss": 0.5805, + "num_tokens": 44609387.0, + "step": 235 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 0.5133063136328732, + "learning_rate": 4.987876971484988e-05, + "loss": 0.5787, + "num_tokens": 45550959.0, + "step": 240 + }, + { + "epoch": 0.08361774744027303, + "grad_norm": 0.6095708293579054, + "learning_rate": 4.9865247959538194e-05, + "loss": 0.5976, + "num_tokens": 46433321.0, + "step": 245 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 0.4814034781564727, + "learning_rate": 4.985101370252483e-05, + "loss": 0.5872, + "num_tokens": 47474526.0, + "step": 250 + }, + { + "epoch": 0.08703071672354949, + "grad_norm": 0.5162969780823659, + "learning_rate": 4.983606739727816e-05, + "loss": 0.5863, + "num_tokens": 48425336.0, + "step": 255 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 0.5073448552205259, + "learning_rate": 4.982040951995066e-05, + "loss": 0.5821, + "num_tokens": 49377672.0, + "step": 260 + }, + { + "epoch": 0.09044368600682594, + "grad_norm": 0.6390759679559125, + "learning_rate": 4.980404056936371e-05, + "loss": 0.5822, + "num_tokens": 50297482.0, + "step": 265 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 0.505488369723625, + "learning_rate": 4.978696106699175e-05, + "loss": 0.5777, + "num_tokens": 51201531.0, + "step": 270 + }, + { + "epoch": 0.09385665529010238, + "grad_norm": 0.4834540354696749, + "learning_rate": 4.976917155694565e-05, + "loss": 0.5706, + "num_tokens": 52135060.0, + "step": 275 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 0.5091081042854496, + "learning_rate": 4.9750672605955385e-05, + "loss": 0.5887, + "num_tokens": 53019691.0, + "step": 280 + }, + { + "epoch": 0.09726962457337884, + "grad_norm": 0.5272029644773706, + "learning_rate": 4.9731464803351944e-05, + "loss": 0.5768, + "num_tokens": 54021567.0, + "step": 285 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 0.5618712865310092, + "learning_rate": 4.971154876104862e-05, + "loss": 0.5707, + "num_tokens": 54928756.0, + "step": 290 + }, + { + "epoch": 0.10068259385665529, + "grad_norm": 0.5079662485961427, + "learning_rate": 4.969092511352143e-05, + "loss": 0.5764, + "num_tokens": 55901721.0, + "step": 295 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 0.5590533288223599, + "learning_rate": 4.9669594517789004e-05, + "loss": 0.6059, + "num_tokens": 56919336.0, + "step": 300 + }, + { + "epoch": 0.10409556313993173, + "grad_norm": 0.5124289657226824, + "learning_rate": 4.9647557653391544e-05, + "loss": 0.5486, + "num_tokens": 57859693.0, + "step": 305 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 0.5236808101484978, + "learning_rate": 4.9624815222369283e-05, + "loss": 0.5744, + "num_tokens": 58769487.0, + "step": 310 + }, + { + "epoch": 0.1075085324232082, + "grad_norm": 0.41840286703818447, + "learning_rate": 4.9601367949240034e-05, + "loss": 0.5571, + "num_tokens": 59686216.0, + "step": 315 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 0.49183814869408765, + "learning_rate": 4.957721658097616e-05, + "loss": 0.5778, + "num_tokens": 60660649.0, + "step": 320 + }, + { + "epoch": 0.11092150170648464, + "grad_norm": 0.5261379848235058, + "learning_rate": 4.955236188698076e-05, + "loss": 0.5581, + "num_tokens": 61640951.0, + "step": 325 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 0.5047161464074051, + "learning_rate": 4.9526804659063135e-05, + "loss": 0.5453, + "num_tokens": 62673288.0, + "step": 330 + }, + { + "epoch": 0.11433447098976109, + "grad_norm": 0.5370013069114835, + "learning_rate": 4.950054571141362e-05, + "loss": 0.5704, + "num_tokens": 63654789.0, + "step": 335 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 0.5065504794071035, + "learning_rate": 4.94735858805776e-05, + "loss": 0.5564, + "num_tokens": 64575366.0, + "step": 340 + }, + { + "epoch": 0.11774744027303755, + "grad_norm": 0.5367370357250334, + "learning_rate": 4.9445926025428856e-05, + "loss": 0.5682, + "num_tokens": 65572577.0, + "step": 345 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 0.6256624388445353, + "learning_rate": 4.9417567027142245e-05, + "loss": 0.5691, + "num_tokens": 66496209.0, + "step": 350 + }, + { + "epoch": 0.12116040955631399, + "grad_norm": 0.5364526247895124, + "learning_rate": 4.938850978916557e-05, + "loss": 0.5963, + "num_tokens": 67477381.0, + "step": 355 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 0.5760517333994615, + "learning_rate": 4.935875523719086e-05, + "loss": 0.5676, + "num_tokens": 68438217.0, + "step": 360 + }, + { + "epoch": 0.12457337883959044, + "grad_norm": 0.4912183552600293, + "learning_rate": 4.932830431912484e-05, + "loss": 0.5689, + "num_tokens": 69455969.0, + "step": 365 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 0.4760184495750453, + "learning_rate": 4.929715800505873e-05, + "loss": 0.5763, + "num_tokens": 70344364.0, + "step": 370 + }, + { + "epoch": 0.12798634812286688, + "grad_norm": 0.45385512480590845, + "learning_rate": 4.926531728723738e-05, + "loss": 0.5871, + "num_tokens": 71311780.0, + "step": 375 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 0.4698082651802184, + "learning_rate": 4.923278318002761e-05, + "loss": 0.5545, + "num_tokens": 72264786.0, + "step": 380 + }, + { + "epoch": 0.13139931740614336, + "grad_norm": 0.45414350096683564, + "learning_rate": 4.919955671988592e-05, + "loss": 0.5368, + "num_tokens": 73254319.0, + "step": 385 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 0.4488305223658382, + "learning_rate": 4.916563896532549e-05, + "loss": 0.5538, + "num_tokens": 74233642.0, + "step": 390 + }, + { + "epoch": 0.1348122866894198, + "grad_norm": 0.4525201301734097, + "learning_rate": 4.91310309968824e-05, + "loss": 0.5689, + "num_tokens": 75216559.0, + "step": 395 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 0.5580205197947308, + "learning_rate": 4.90957339170813e-05, + "loss": 0.5684, + "num_tokens": 76173511.0, + "step": 400 + }, + { + "epoch": 0.13822525597269625, + "grad_norm": 0.43833715671179474, + "learning_rate": 4.905974885040015e-05, + "loss": 0.5537, + "num_tokens": 77137128.0, + "step": 405 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 0.43081913118053056, + "learning_rate": 4.902307694323456e-05, + "loss": 0.5595, + "num_tokens": 78183541.0, + "step": 410 + }, + { + "epoch": 0.1416382252559727, + "grad_norm": 0.46606853464523745, + "learning_rate": 4.8985719363861135e-05, + "loss": 0.572, + "num_tokens": 79163656.0, + "step": 415 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": 0.5472361464183433, + "learning_rate": 4.8947677302400326e-05, + "loss": 0.5522, + "num_tokens": 80166162.0, + "step": 420 + }, + { + "epoch": 0.14505119453924914, + "grad_norm": 0.4537573554760025, + "learning_rate": 4.890895197077848e-05, + "loss": 0.5507, + "num_tokens": 81121834.0, + "step": 425 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 0.5290898874704273, + "learning_rate": 4.886954460268927e-05, + "loss": 0.5702, + "num_tokens": 81987283.0, + "step": 430 + }, + { + "epoch": 0.14846416382252559, + "grad_norm": 0.45456782141151, + "learning_rate": 4.882945645355435e-05, + "loss": 0.5756, + "num_tokens": 82994121.0, + "step": 435 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 0.4809659734303757, + "learning_rate": 4.878868880048341e-05, + "loss": 0.5614, + "num_tokens": 83915537.0, + "step": 440 + }, + { + "epoch": 0.15187713310580206, + "grad_norm": 0.52139675211743, + "learning_rate": 4.874724294223343e-05, + "loss": 0.5444, + "num_tokens": 84755157.0, + "step": 445 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 0.4544341064553721, + "learning_rate": 4.870512019916734e-05, + "loss": 0.5433, + "num_tokens": 85690047.0, + "step": 450 + }, + { + "epoch": 0.1552901023890785, + "grad_norm": 0.4558960340950479, + "learning_rate": 4.866232191321199e-05, + "loss": 0.5933, + "num_tokens": 86736902.0, + "step": 455 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 0.47990907617604506, + "learning_rate": 4.8618849447815305e-05, + "loss": 0.5745, + "num_tokens": 87705484.0, + "step": 460 + }, + { + "epoch": 0.15870307167235495, + "grad_norm": 0.4471349487997525, + "learning_rate": 4.8574704187902955e-05, + "loss": 0.5493, + "num_tokens": 88652585.0, + "step": 465 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 0.5050821167458766, + "learning_rate": 4.8529887539834144e-05, + "loss": 0.5559, + "num_tokens": 89594618.0, + "step": 470 + }, + { + "epoch": 0.1621160409556314, + "grad_norm": 0.433008399283799, + "learning_rate": 4.84844009313569e-05, + "loss": 0.5465, + "num_tokens": 90594832.0, + "step": 475 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 0.42424498611581374, + "learning_rate": 4.843824581156249e-05, + "loss": 0.5555, + "num_tokens": 91557529.0, + "step": 480 + }, + { + "epoch": 0.16552901023890784, + "grad_norm": 0.4536724303374971, + "learning_rate": 4.839142365083932e-05, + "loss": 0.5586, + "num_tokens": 92520759.0, + "step": 485 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 0.4135691134510148, + "learning_rate": 4.8343935940826104e-05, + "loss": 0.5463, + "num_tokens": 93508244.0, + "step": 490 + }, + { + "epoch": 0.1689419795221843, + "grad_norm": 0.49925946717984443, + "learning_rate": 4.829578419436427e-05, + "loss": 0.5758, + "num_tokens": 94489856.0, + "step": 495 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 0.4795792461133882, + "learning_rate": 4.824696994544985e-05, + "loss": 0.5581, + "num_tokens": 95453893.0, + "step": 500 + }, + { + "epoch": 0.17235494880546076, + "grad_norm": 0.46669919948991945, + "learning_rate": 4.819749474918455e-05, + "loss": 0.556, + "num_tokens": 96333287.0, + "step": 505 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 0.4707277196611808, + "learning_rate": 4.814736018172624e-05, + "loss": 0.5583, + "num_tokens": 97319183.0, + "step": 510 + }, + { + "epoch": 0.1757679180887372, + "grad_norm": 0.44856398131453706, + "learning_rate": 4.809656784023872e-05, + "loss": 0.5643, + "num_tokens": 98374455.0, + "step": 515 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 0.41999315891743993, + "learning_rate": 4.8045119342840885e-05, + "loss": 0.5368, + "num_tokens": 99400041.0, + "step": 520 + }, + { + "epoch": 0.17918088737201365, + "grad_norm": 0.4314801891904282, + "learning_rate": 4.799301632855508e-05, + "loss": 0.5682, + "num_tokens": 100419726.0, + "step": 525 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 0.4532725946713624, + "learning_rate": 4.794026045725501e-05, + "loss": 0.5413, + "num_tokens": 101373875.0, + "step": 530 + }, + { + "epoch": 0.1825938566552901, + "grad_norm": 0.45204637278231036, + "learning_rate": 4.788685340961276e-05, + "loss": 0.5561, + "num_tokens": 102331658.0, + "step": 535 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 0.4711186304154966, + "learning_rate": 4.7832796887045276e-05, + "loss": 0.5421, + "num_tokens": 103240516.0, + "step": 540 + }, + { + "epoch": 0.18600682593856654, + "grad_norm": 0.4626002965808097, + "learning_rate": 4.7778092611660225e-05, + "loss": 0.5696, + "num_tokens": 104162605.0, + "step": 545 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 0.423630300008881, + "learning_rate": 4.772274232620104e-05, + "loss": 0.5532, + "num_tokens": 105061908.0, + "step": 550 + }, + { + "epoch": 0.189419795221843, + "grad_norm": 0.5303058647014298, + "learning_rate": 4.766674779399145e-05, + "loss": 0.5634, + "num_tokens": 105919969.0, + "step": 555 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 0.4528277982211153, + "learning_rate": 4.76101107988793e-05, + "loss": 0.5775, + "num_tokens": 106919294.0, + "step": 560 + }, + { + "epoch": 0.19283276450511946, + "grad_norm": 0.375604144309103, + "learning_rate": 4.7552833145179746e-05, + "loss": 0.5127, + "num_tokens": 107846976.0, + "step": 565 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 0.44488024598088494, + "learning_rate": 4.749491665761772e-05, + "loss": 0.5388, + "num_tokens": 108819219.0, + "step": 570 + }, + { + "epoch": 0.1962457337883959, + "grad_norm": 0.4154713260123601, + "learning_rate": 4.7436363181269825e-05, + "loss": 0.5469, + "num_tokens": 109845258.0, + "step": 575 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 0.39816353681189776, + "learning_rate": 4.737717458150558e-05, + "loss": 0.5519, + "num_tokens": 110858993.0, + "step": 580 + }, + { + "epoch": 0.19965870307167236, + "grad_norm": 0.44831838753632824, + "learning_rate": 4.7317352743927954e-05, + "loss": 0.5578, + "num_tokens": 111788546.0, + "step": 585 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 0.4186460516616093, + "learning_rate": 4.7256899574313304e-05, + "loss": 0.5472, + "num_tokens": 112732095.0, + "step": 590 + }, + { + "epoch": 0.2030716723549488, + "grad_norm": 0.44237132476023605, + "learning_rate": 4.71958169985507e-05, + "loss": 0.5493, + "num_tokens": 113649022.0, + "step": 595 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 0.43701302963747873, + "learning_rate": 4.7134106962580516e-05, + "loss": 0.5569, + "num_tokens": 114540376.0, + "step": 600 + }, + { + "epoch": 0.20648464163822525, + "grad_norm": 0.4128633428095234, + "learning_rate": 4.707177143233247e-05, + "loss": 0.5513, + "num_tokens": 115480997.0, + "step": 605 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 0.3755079914854989, + "learning_rate": 4.7008812393662996e-05, + "loss": 0.5255, + "num_tokens": 116464215.0, + "step": 610 + }, + { + "epoch": 0.2098976109215017, + "grad_norm": 0.4092404736922847, + "learning_rate": 4.694523185229196e-05, + "loss": 0.5398, + "num_tokens": 117413382.0, + "step": 615 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 0.4130984750062132, + "learning_rate": 4.688103183373877e-05, + "loss": 0.5355, + "num_tokens": 118465258.0, + "step": 620 + }, + { + "epoch": 0.21331058020477817, + "grad_norm": 0.4350871959699708, + "learning_rate": 4.6816214383257864e-05, + "loss": 0.5507, + "num_tokens": 119368272.0, + "step": 625 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 0.42712415977098717, + "learning_rate": 4.6750781565773524e-05, + "loss": 0.5376, + "num_tokens": 120323497.0, + "step": 630 + }, + { + "epoch": 0.2167235494880546, + "grad_norm": 0.4018174070587826, + "learning_rate": 4.6684735465814114e-05, + "loss": 0.5623, + "num_tokens": 121336091.0, + "step": 635 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 0.40967629179586657, + "learning_rate": 4.661807818744568e-05, + "loss": 0.5345, + "num_tokens": 122331818.0, + "step": 640 + }, + { + "epoch": 0.22013651877133106, + "grad_norm": 0.4516013972993547, + "learning_rate": 4.6550811854204896e-05, + "loss": 0.545, + "num_tokens": 123276577.0, + "step": 645 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 0.4167635858598471, + "learning_rate": 4.6482938609031406e-05, + "loss": 0.5574, + "num_tokens": 124260967.0, + "step": 650 + }, + { + "epoch": 0.2235494880546075, + "grad_norm": 0.4027183249033178, + "learning_rate": 4.6414460614199614e-05, + "loss": 0.558, + "num_tokens": 125178584.0, + "step": 655 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 0.3943619941232963, + "learning_rate": 4.6345380051249726e-05, + "loss": 0.5359, + "num_tokens": 126115279.0, + "step": 660 + }, + { + "epoch": 0.22696245733788395, + "grad_norm": 0.41605588299949164, + "learning_rate": 4.627569912091829e-05, + "loss": 0.5308, + "num_tokens": 127123510.0, + "step": 665 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 0.4232494441201061, + "learning_rate": 4.620542004306808e-05, + "loss": 0.5291, + "num_tokens": 128096244.0, + "step": 670 + }, + { + "epoch": 0.23037542662116042, + "grad_norm": 0.36709719294748555, + "learning_rate": 4.613454505661738e-05, + "loss": 0.545, + "num_tokens": 129070712.0, + "step": 675 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 0.4284861781533593, + "learning_rate": 4.606307641946867e-05, + "loss": 0.5639, + "num_tokens": 129992439.0, + "step": 680 + }, + { + "epoch": 0.23378839590443687, + "grad_norm": 0.4278163699139823, + "learning_rate": 4.599101640843664e-05, + "loss": 0.539, + "num_tokens": 130917322.0, + "step": 685 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 0.3880253078166962, + "learning_rate": 4.591836731917573e-05, + "loss": 0.5683, + "num_tokens": 131869001.0, + "step": 690 + }, + { + "epoch": 0.23720136518771331, + "grad_norm": 0.48519543443409835, + "learning_rate": 4.584513146610694e-05, + "loss": 0.5578, + "num_tokens": 132871820.0, + "step": 695 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 0.4331997241836761, + "learning_rate": 4.577131118234413e-05, + "loss": 0.5642, + "num_tokens": 133787994.0, + "step": 700 + }, + { + "epoch": 0.24061433447098976, + "grad_norm": 0.4124588160404072, + "learning_rate": 4.569690881961967e-05, + "loss": 0.531, + "num_tokens": 134665258.0, + "step": 705 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 0.40688160744028257, + "learning_rate": 4.562192674820957e-05, + "loss": 0.536, + "num_tokens": 135563718.0, + "step": 710 + }, + { + "epoch": 0.2440273037542662, + "grad_norm": 0.3855522562994955, + "learning_rate": 4.554636735685786e-05, + "loss": 0.5366, + "num_tokens": 136530274.0, + "step": 715 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 0.4249521897707511, + "learning_rate": 4.547023305270064e-05, + "loss": 0.5475, + "num_tokens": 137544925.0, + "step": 720 + }, + { + "epoch": 0.24744027303754265, + "grad_norm": 0.6254856649079702, + "learning_rate": 4.539352626118926e-05, + "loss": 0.5417, + "num_tokens": 138475799.0, + "step": 725 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 0.3911156875271076, + "learning_rate": 4.5316249426013126e-05, + "loss": 0.5201, + "num_tokens": 139435802.0, + "step": 730 + }, + { + "epoch": 0.2508532423208191, + "grad_norm": 0.4688833261931172, + "learning_rate": 4.523840500902183e-05, + "loss": 0.5373, + "num_tokens": 140314284.0, + "step": 735 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 0.4261015924614589, + "learning_rate": 4.515999549014673e-05, + "loss": 0.5329, + "num_tokens": 141219364.0, + "step": 740 + }, + { + "epoch": 0.25426621160409557, + "grad_norm": 0.46451445077951103, + "learning_rate": 4.5081023367321916e-05, + "loss": 0.5369, + "num_tokens": 142303539.0, + "step": 745 + }, + { + "epoch": 0.25597269624573377, + "grad_norm": 0.4899621819611558, + "learning_rate": 4.500149115640468e-05, + "loss": 0.5736, + "num_tokens": 143301347.0, + "step": 750 + }, + { + "epoch": 0.257679180887372, + "grad_norm": 0.40185507186448355, + "learning_rate": 4.492140139109533e-05, + "loss": 0.529, + "num_tokens": 144231893.0, + "step": 755 + }, + { + "epoch": 0.2593856655290102, + "grad_norm": 0.3751786581425354, + "learning_rate": 4.484075662285647e-05, + "loss": 0.5366, + "num_tokens": 145160611.0, + "step": 760 + }, + { + "epoch": 0.26109215017064846, + "grad_norm": 0.383945095710832, + "learning_rate": 4.475955942083176e-05, + "loss": 0.5286, + "num_tokens": 146121565.0, + "step": 765 + }, + { + "epoch": 0.2627986348122867, + "grad_norm": 0.39293769351911984, + "learning_rate": 4.4677812371764e-05, + "loss": 0.5177, + "num_tokens": 147031619.0, + "step": 770 + }, + { + "epoch": 0.2645051194539249, + "grad_norm": 0.36541237345446015, + "learning_rate": 4.45955180799128e-05, + "loss": 0.5289, + "num_tokens": 147981269.0, + "step": 775 + }, + { + "epoch": 0.26621160409556316, + "grad_norm": 0.4692171769339349, + "learning_rate": 4.4512679166971553e-05, + "loss": 0.5489, + "num_tokens": 148964661.0, + "step": 780 + }, + { + "epoch": 0.26791808873720135, + "grad_norm": 0.3687554090744247, + "learning_rate": 4.442929827198395e-05, + "loss": 0.5471, + "num_tokens": 150008239.0, + "step": 785 + }, + { + "epoch": 0.2696245733788396, + "grad_norm": 0.45244715310494926, + "learning_rate": 4.43453780512599e-05, + "loss": 0.5466, + "num_tokens": 150937307.0, + "step": 790 + }, + { + "epoch": 0.2713310580204778, + "grad_norm": 0.4452320820857673, + "learning_rate": 4.4260921178290866e-05, + "loss": 0.5407, + "num_tokens": 151860116.0, + "step": 795 + }, + { + "epoch": 0.27303754266211605, + "grad_norm": 0.42732274617096166, + "learning_rate": 4.417593034366478e-05, + "loss": 0.5311, + "num_tokens": 152834849.0, + "step": 800 + }, + { + "epoch": 0.27474402730375425, + "grad_norm": 0.4052649603454169, + "learning_rate": 4.409040825498024e-05, + "loss": 0.5115, + "num_tokens": 153761800.0, + "step": 805 + }, + { + "epoch": 0.2764505119453925, + "grad_norm": 0.4397536584781063, + "learning_rate": 4.40043576367603e-05, + "loss": 0.5268, + "num_tokens": 154739335.0, + "step": 810 + }, + { + "epoch": 0.2781569965870307, + "grad_norm": 0.42381590061263635, + "learning_rate": 4.3917781230365677e-05, + "loss": 0.5554, + "num_tokens": 155726110.0, + "step": 815 + }, + { + "epoch": 0.27986348122866894, + "grad_norm": 0.4189254439755074, + "learning_rate": 4.383068179390739e-05, + "loss": 0.5435, + "num_tokens": 156709373.0, + "step": 820 + }, + { + "epoch": 0.2815699658703072, + "grad_norm": 0.4194447248910214, + "learning_rate": 4.3743062102158896e-05, + "loss": 0.5318, + "num_tokens": 157605031.0, + "step": 825 + }, + { + "epoch": 0.2832764505119454, + "grad_norm": 0.3731419235770229, + "learning_rate": 4.3654924946467724e-05, + "loss": 0.517, + "num_tokens": 158541316.0, + "step": 830 + }, + { + "epoch": 0.28498293515358364, + "grad_norm": 0.3749285811277496, + "learning_rate": 4.3566273134666525e-05, + "loss": 0.5494, + "num_tokens": 159525622.0, + "step": 835 + }, + { + "epoch": 0.28668941979522183, + "grad_norm": 0.4309015308982524, + "learning_rate": 4.3477109490983626e-05, + "loss": 0.5424, + "num_tokens": 160459756.0, + "step": 840 + }, + { + "epoch": 0.2883959044368601, + "grad_norm": 0.3986942602299617, + "learning_rate": 4.338743685595304e-05, + "loss": 0.5228, + "num_tokens": 161382919.0, + "step": 845 + }, + { + "epoch": 0.2901023890784983, + "grad_norm": 0.4258708102475343, + "learning_rate": 4.329725808632403e-05, + "loss": 0.5365, + "num_tokens": 162358277.0, + "step": 850 + }, + { + "epoch": 0.29180887372013653, + "grad_norm": 0.38918974623711616, + "learning_rate": 4.320657605497001e-05, + "loss": 0.5522, + "num_tokens": 163332894.0, + "step": 855 + }, + { + "epoch": 0.2935153583617747, + "grad_norm": 0.41923964286389137, + "learning_rate": 4.3115393650797095e-05, + "loss": 0.5384, + "num_tokens": 164401378.0, + "step": 860 + }, + { + "epoch": 0.295221843003413, + "grad_norm": 0.4156858487356743, + "learning_rate": 4.3023713778652074e-05, + "loss": 0.5049, + "num_tokens": 165316411.0, + "step": 865 + }, + { + "epoch": 0.29692832764505117, + "grad_norm": 0.39774241581331693, + "learning_rate": 4.2931539359229804e-05, + "loss": 0.5192, + "num_tokens": 166276916.0, + "step": 870 + }, + { + "epoch": 0.2986348122866894, + "grad_norm": 0.35406058199730683, + "learning_rate": 4.283887332898019e-05, + "loss": 0.5127, + "num_tokens": 167298025.0, + "step": 875 + }, + { + "epoch": 0.3003412969283277, + "grad_norm": 0.3843676536025804, + "learning_rate": 4.2745718640014696e-05, + "loss": 0.5318, + "num_tokens": 168250987.0, + "step": 880 + }, + { + "epoch": 0.30204778156996587, + "grad_norm": 0.38653674163211976, + "learning_rate": 4.265207826001219e-05, + "loss": 0.5336, + "num_tokens": 169245557.0, + "step": 885 + }, + { + "epoch": 0.3037542662116041, + "grad_norm": 0.4322065456917861, + "learning_rate": 4.255795517212451e-05, + "loss": 0.5489, + "num_tokens": 170217424.0, + "step": 890 + }, + { + "epoch": 0.3054607508532423, + "grad_norm": 0.41164873108292127, + "learning_rate": 4.246335237488136e-05, + "loss": 0.5171, + "num_tokens": 171143325.0, + "step": 895 + }, + { + "epoch": 0.30716723549488056, + "grad_norm": 0.4104853255150601, + "learning_rate": 4.236827288209478e-05, + "loss": 0.5223, + "num_tokens": 172160313.0, + "step": 900 + }, + { + "epoch": 0.30887372013651876, + "grad_norm": 0.43043638343636487, + "learning_rate": 4.2272719722763197e-05, + "loss": 0.5246, + "num_tokens": 173195128.0, + "step": 905 + }, + { + "epoch": 0.310580204778157, + "grad_norm": 0.41502896344603557, + "learning_rate": 4.217669594097485e-05, + "loss": 0.5379, + "num_tokens": 174112017.0, + "step": 910 + }, + { + "epoch": 0.3122866894197952, + "grad_norm": 0.40961938646342305, + "learning_rate": 4.208020459581087e-05, + "loss": 0.5343, + "num_tokens": 175151908.0, + "step": 915 + }, + { + "epoch": 0.31399317406143346, + "grad_norm": 0.4009346397822526, + "learning_rate": 4.19832487612478e-05, + "loss": 0.5057, + "num_tokens": 176115268.0, + "step": 920 + }, + { + "epoch": 0.31569965870307165, + "grad_norm": 0.38967085585332806, + "learning_rate": 4.1885831526059674e-05, + "loss": 0.5108, + "num_tokens": 177084976.0, + "step": 925 + }, + { + "epoch": 0.3174061433447099, + "grad_norm": 0.3386319476796638, + "learning_rate": 4.178795599371961e-05, + "loss": 0.4975, + "num_tokens": 178049137.0, + "step": 930 + }, + { + "epoch": 0.3191126279863481, + "grad_norm": 0.430097609031975, + "learning_rate": 4.168962528230096e-05, + "loss": 0.5321, + "num_tokens": 178990489.0, + "step": 935 + }, + { + "epoch": 0.32081911262798635, + "grad_norm": 0.3918509038411313, + "learning_rate": 4.1590842524377914e-05, + "loss": 0.5297, + "num_tokens": 179947208.0, + "step": 940 + }, + { + "epoch": 0.3225255972696246, + "grad_norm": 0.4113406051465179, + "learning_rate": 4.149161086692581e-05, + "loss": 0.5375, + "num_tokens": 180895300.0, + "step": 945 + }, + { + "epoch": 0.3242320819112628, + "grad_norm": 0.3812963085578539, + "learning_rate": 4.139193347122077e-05, + "loss": 0.5323, + "num_tokens": 181891310.0, + "step": 950 + }, + { + "epoch": 0.32593856655290104, + "grad_norm": 0.427163248722511, + "learning_rate": 4.1291813512739074e-05, + "loss": 0.53, + "num_tokens": 182829455.0, + "step": 955 + }, + { + "epoch": 0.32764505119453924, + "grad_norm": 0.3684699313638392, + "learning_rate": 4.1191254181055936e-05, + "loss": 0.52, + "num_tokens": 183776326.0, + "step": 960 + }, + { + "epoch": 0.3293515358361775, + "grad_norm": 0.36070078090014446, + "learning_rate": 4.1090258679743934e-05, + "loss": 0.5176, + "num_tokens": 184739434.0, + "step": 965 + }, + { + "epoch": 0.3310580204778157, + "grad_norm": 0.45870874889230046, + "learning_rate": 4.098883022627094e-05, + "loss": 0.5657, + "num_tokens": 185721070.0, + "step": 970 + }, + { + "epoch": 0.33276450511945393, + "grad_norm": 0.4188344945747834, + "learning_rate": 4.0886972051897594e-05, + "loss": 0.533, + "num_tokens": 186739113.0, + "step": 975 + }, + { + "epoch": 0.33447098976109213, + "grad_norm": 0.45500040021056365, + "learning_rate": 4.078468740157439e-05, + "loss": 0.537, + "num_tokens": 187730193.0, + "step": 980 + }, + { + "epoch": 0.3361774744027304, + "grad_norm": 0.40761293819970623, + "learning_rate": 4.068197953383832e-05, + "loss": 0.5221, + "num_tokens": 188652348.0, + "step": 985 + }, + { + "epoch": 0.3378839590443686, + "grad_norm": 0.4263098333005298, + "learning_rate": 4.0578851720709e-05, + "loss": 0.519, + "num_tokens": 189611145.0, + "step": 990 + }, + { + "epoch": 0.3395904436860068, + "grad_norm": 0.35780922785817854, + "learning_rate": 4.047530724758451e-05, + "loss": 0.5263, + "num_tokens": 190593764.0, + "step": 995 + }, + { + "epoch": 0.3412969283276451, + "grad_norm": 0.3827403926446559, + "learning_rate": 4.037134941313668e-05, + "loss": 0.5182, + "num_tokens": 191543591.0, + "step": 1000 + }, + { + "epoch": 0.3430034129692833, + "grad_norm": 0.391417552124784, + "learning_rate": 4.026698152920599e-05, + "loss": 0.514, + "num_tokens": 192536034.0, + "step": 1005 + }, + { + "epoch": 0.3447098976109215, + "grad_norm": 0.3750506973668557, + "learning_rate": 4.016220692069612e-05, + "loss": 0.5227, + "num_tokens": 193451364.0, + "step": 1010 + }, + { + "epoch": 0.3464163822525597, + "grad_norm": 0.40025338962482404, + "learning_rate": 4.005702892546798e-05, + "loss": 0.534, + "num_tokens": 194391841.0, + "step": 1015 + }, + { + "epoch": 0.34812286689419797, + "grad_norm": 0.36297817083661593, + "learning_rate": 3.9951450894233365e-05, + "loss": 0.5183, + "num_tokens": 195399830.0, + "step": 1020 + }, + { + "epoch": 0.34982935153583616, + "grad_norm": 0.3598098656728241, + "learning_rate": 3.984547619044827e-05, + "loss": 0.5115, + "num_tokens": 196363387.0, + "step": 1025 + }, + { + "epoch": 0.3515358361774744, + "grad_norm": 0.38480064839017475, + "learning_rate": 3.973910819020567e-05, + "loss": 0.5009, + "num_tokens": 197289380.0, + "step": 1030 + }, + { + "epoch": 0.3532423208191126, + "grad_norm": 0.37900568253469097, + "learning_rate": 3.963235028212802e-05, + "loss": 0.5334, + "num_tokens": 198263603.0, + "step": 1035 + }, + { + "epoch": 0.35494880546075086, + "grad_norm": 0.35505267262656565, + "learning_rate": 3.9525205867259246e-05, + "loss": 0.4977, + "num_tokens": 199248080.0, + "step": 1040 + }, + { + "epoch": 0.35665529010238906, + "grad_norm": 0.4178322580187183, + "learning_rate": 3.941767835895647e-05, + "loss": 0.5247, + "num_tokens": 200182740.0, + "step": 1045 + }, + { + "epoch": 0.3583617747440273, + "grad_norm": 0.3886991806809196, + "learning_rate": 3.9309771182781194e-05, + "loss": 0.5592, + "num_tokens": 201135153.0, + "step": 1050 + }, + { + "epoch": 0.36006825938566556, + "grad_norm": 0.3729591088065611, + "learning_rate": 3.9201487776390215e-05, + "loss": 0.5174, + "num_tokens": 202034499.0, + "step": 1055 + }, + { + "epoch": 0.36177474402730375, + "grad_norm": 0.43050604928861597, + "learning_rate": 3.90928315894261e-05, + "loss": 0.5203, + "num_tokens": 202936389.0, + "step": 1060 + }, + { + "epoch": 0.363481228668942, + "grad_norm": 0.3903868868255724, + "learning_rate": 3.898380608340728e-05, + "loss": 0.5121, + "num_tokens": 203808949.0, + "step": 1065 + }, + { + "epoch": 0.3651877133105802, + "grad_norm": 0.43315506398310644, + "learning_rate": 3.887441473161779e-05, + "loss": 0.5268, + "num_tokens": 204803047.0, + "step": 1070 + }, + { + "epoch": 0.36689419795221845, + "grad_norm": 0.3826063463200933, + "learning_rate": 3.87646610189966e-05, + "loss": 0.526, + "num_tokens": 205764413.0, + "step": 1075 + }, + { + "epoch": 0.36860068259385664, + "grad_norm": 0.3804292887775751, + "learning_rate": 3.8654548442026615e-05, + "loss": 0.5121, + "num_tokens": 206795183.0, + "step": 1080 + }, + { + "epoch": 0.3703071672354949, + "grad_norm": 0.386486637072244, + "learning_rate": 3.854408050862326e-05, + "loss": 0.5197, + "num_tokens": 207776278.0, + "step": 1085 + }, + { + "epoch": 0.3720136518771331, + "grad_norm": 0.3679656863557538, + "learning_rate": 3.843326073802275e-05, + "loss": 0.5109, + "num_tokens": 208806680.0, + "step": 1090 + }, + { + "epoch": 0.37372013651877134, + "grad_norm": 0.36341929778692034, + "learning_rate": 3.832209266066996e-05, + "loss": 0.5117, + "num_tokens": 209705965.0, + "step": 1095 + }, + { + "epoch": 0.37542662116040953, + "grad_norm": 0.36095775324919993, + "learning_rate": 3.821057981810597e-05, + "loss": 0.5173, + "num_tokens": 210620229.0, + "step": 1100 + }, + { + "epoch": 0.3771331058020478, + "grad_norm": 0.37920120200226964, + "learning_rate": 3.809872576285522e-05, + "loss": 0.5278, + "num_tokens": 211518941.0, + "step": 1105 + }, + { + "epoch": 0.378839590443686, + "grad_norm": 0.37125338141855646, + "learning_rate": 3.798653405831236e-05, + "loss": 0.5213, + "num_tokens": 212481056.0, + "step": 1110 + }, + { + "epoch": 0.38054607508532423, + "grad_norm": 0.39363430670991295, + "learning_rate": 3.78740082786287e-05, + "loss": 0.5081, + "num_tokens": 213401212.0, + "step": 1115 + }, + { + "epoch": 0.3822525597269625, + "grad_norm": 0.41301840155723674, + "learning_rate": 3.7761152008598356e-05, + "loss": 0.5262, + "num_tokens": 214296967.0, + "step": 1120 + }, + { + "epoch": 0.3839590443686007, + "grad_norm": 0.4004388299714662, + "learning_rate": 3.764796884354408e-05, + "loss": 0.5295, + "num_tokens": 215306580.0, + "step": 1125 + }, + { + "epoch": 0.3856655290102389, + "grad_norm": 0.35857577455632367, + "learning_rate": 3.7534462389202655e-05, + "loss": 0.5328, + "num_tokens": 216266686.0, + "step": 1130 + }, + { + "epoch": 0.3873720136518771, + "grad_norm": 0.357927002022941, + "learning_rate": 3.742063626161011e-05, + "loss": 0.5307, + "num_tokens": 217244190.0, + "step": 1135 + }, + { + "epoch": 0.3890784982935154, + "grad_norm": 0.4316739204850572, + "learning_rate": 3.7306494086986424e-05, + "loss": 0.5115, + "num_tokens": 218179883.0, + "step": 1140 + }, + { + "epoch": 0.39078498293515357, + "grad_norm": 0.3236368199361241, + "learning_rate": 3.7192039501620114e-05, + "loss": 0.5265, + "num_tokens": 219217201.0, + "step": 1145 + }, + { + "epoch": 0.3924914675767918, + "grad_norm": 0.38665727290508983, + "learning_rate": 3.7077276151752274e-05, + "loss": 0.5137, + "num_tokens": 220144855.0, + "step": 1150 + }, + { + "epoch": 0.39419795221843, + "grad_norm": 0.37991237354351537, + "learning_rate": 3.696220769346052e-05, + "loss": 0.515, + "num_tokens": 221131861.0, + "step": 1155 + }, + { + "epoch": 0.39590443686006827, + "grad_norm": 0.4047905746984203, + "learning_rate": 3.6846837792542446e-05, + "loss": 0.5289, + "num_tokens": 222093783.0, + "step": 1160 + }, + { + "epoch": 0.39761092150170646, + "grad_norm": 0.4226911202426522, + "learning_rate": 3.673117012439889e-05, + "loss": 0.5267, + "num_tokens": 223054352.0, + "step": 1165 + }, + { + "epoch": 0.3993174061433447, + "grad_norm": 0.37522461139025454, + "learning_rate": 3.6615208373916775e-05, + "loss": 0.4879, + "num_tokens": 223929151.0, + "step": 1170 + }, + { + "epoch": 0.40102389078498296, + "grad_norm": 0.40207189529772014, + "learning_rate": 3.6498956235351815e-05, + "loss": 0.5245, + "num_tokens": 224865728.0, + "step": 1175 + }, + { + "epoch": 0.40273037542662116, + "grad_norm": 0.3930165821540444, + "learning_rate": 3.6382417412210744e-05, + "loss": 0.5087, + "num_tokens": 225865685.0, + "step": 1180 + }, + { + "epoch": 0.4044368600682594, + "grad_norm": 0.3578549306424923, + "learning_rate": 3.6265595617133366e-05, + "loss": 0.4939, + "num_tokens": 226749326.0, + "step": 1185 + }, + { + "epoch": 0.4061433447098976, + "grad_norm": 0.36807883381787004, + "learning_rate": 3.6148494571774275e-05, + "loss": 0.5286, + "num_tokens": 227786006.0, + "step": 1190 + }, + { + "epoch": 0.40784982935153585, + "grad_norm": 0.41611237665417367, + "learning_rate": 3.603111800668428e-05, + "loss": 0.5099, + "num_tokens": 228763631.0, + "step": 1195 + }, + { + "epoch": 0.40955631399317405, + "grad_norm": 0.3799547117691007, + "learning_rate": 3.591346966119159e-05, + "loss": 0.5094, + "num_tokens": 229748231.0, + "step": 1200 + }, + { + "epoch": 0.4112627986348123, + "grad_norm": 0.3478677676406051, + "learning_rate": 3.579555328328265e-05, + "loss": 0.5117, + "num_tokens": 230738165.0, + "step": 1205 + }, + { + "epoch": 0.4129692832764505, + "grad_norm": 0.3296304695850409, + "learning_rate": 3.5677372629482775e-05, + "loss": 0.521, + "num_tokens": 231716185.0, + "step": 1210 + }, + { + "epoch": 0.41467576791808874, + "grad_norm": 0.382979600202463, + "learning_rate": 3.555893146473644e-05, + "loss": 0.5262, + "num_tokens": 232698142.0, + "step": 1215 + }, + { + "epoch": 0.41638225255972694, + "grad_norm": 0.35116928864638025, + "learning_rate": 3.5440233562287376e-05, + "loss": 0.5417, + "num_tokens": 233655900.0, + "step": 1220 + }, + { + "epoch": 0.4180887372013652, + "grad_norm": 0.3728136743436132, + "learning_rate": 3.532128270355832e-05, + "loss": 0.516, + "num_tokens": 234596302.0, + "step": 1225 + }, + { + "epoch": 0.4197952218430034, + "grad_norm": 0.4116335593681845, + "learning_rate": 3.520208267803059e-05, + "loss": 0.5242, + "num_tokens": 235502719.0, + "step": 1230 + }, + { + "epoch": 0.42150170648464164, + "grad_norm": 0.38580680323921696, + "learning_rate": 3.508263728312336e-05, + "loss": 0.5278, + "num_tokens": 236475023.0, + "step": 1235 + }, + { + "epoch": 0.4232081911262799, + "grad_norm": 0.38551884251450047, + "learning_rate": 3.496295032407263e-05, + "loss": 0.5229, + "num_tokens": 237433481.0, + "step": 1240 + }, + { + "epoch": 0.4249146757679181, + "grad_norm": 0.38557583134333984, + "learning_rate": 3.484302561381007e-05, + "loss": 0.5029, + "num_tokens": 238378423.0, + "step": 1245 + }, + { + "epoch": 0.42662116040955633, + "grad_norm": 0.40297119331954867, + "learning_rate": 3.47228669728415e-05, + "loss": 0.5288, + "num_tokens": 239310469.0, + "step": 1250 + }, + { + "epoch": 0.4283276450511945, + "grad_norm": 0.36323193941329485, + "learning_rate": 3.4602478229125197e-05, + "loss": 0.5178, + "num_tokens": 240265629.0, + "step": 1255 + }, + { + "epoch": 0.4300341296928328, + "grad_norm": 0.4010782882289214, + "learning_rate": 3.4481863217949964e-05, + "loss": 0.5211, + "num_tokens": 241153898.0, + "step": 1260 + }, + { + "epoch": 0.431740614334471, + "grad_norm": 0.3725440207487266, + "learning_rate": 3.43610257818129e-05, + "loss": 0.5339, + "num_tokens": 242074086.0, + "step": 1265 + }, + { + "epoch": 0.4334470989761092, + "grad_norm": 0.3573796647038546, + "learning_rate": 3.4239969770297033e-05, + "loss": 0.5275, + "num_tokens": 243032696.0, + "step": 1270 + }, + { + "epoch": 0.4351535836177474, + "grad_norm": 0.3500180473141626, + "learning_rate": 3.411869903994867e-05, + "loss": 0.5237, + "num_tokens": 244052484.0, + "step": 1275 + }, + { + "epoch": 0.43686006825938567, + "grad_norm": 0.3630134268573173, + "learning_rate": 3.399721745415451e-05, + "loss": 0.4863, + "num_tokens": 245008254.0, + "step": 1280 + }, + { + "epoch": 0.43856655290102387, + "grad_norm": 0.3276111054395765, + "learning_rate": 3.38755288830186e-05, + "loss": 0.5239, + "num_tokens": 246076299.0, + "step": 1285 + }, + { + "epoch": 0.4402730375426621, + "grad_norm": 0.37922721950544974, + "learning_rate": 3.375363720323904e-05, + "loss": 0.5558, + "num_tokens": 247016964.0, + "step": 1290 + }, + { + "epoch": 0.44197952218430037, + "grad_norm": 0.4053020528274456, + "learning_rate": 3.363154629798444e-05, + "loss": 0.4991, + "num_tokens": 247913243.0, + "step": 1295 + }, + { + "epoch": 0.44368600682593856, + "grad_norm": 0.39968767139036077, + "learning_rate": 3.350926005677027e-05, + "loss": 0.5163, + "num_tokens": 248791992.0, + "step": 1300 + }, + { + "epoch": 0.4453924914675768, + "grad_norm": 0.4196675646397248, + "learning_rate": 3.338678237533491e-05, + "loss": 0.5155, + "num_tokens": 249736240.0, + "step": 1305 + }, + { + "epoch": 0.447098976109215, + "grad_norm": 0.36733233226120704, + "learning_rate": 3.326411715551559e-05, + "loss": 0.5187, + "num_tokens": 250713070.0, + "step": 1310 + }, + { + "epoch": 0.44880546075085326, + "grad_norm": 0.3526872487151841, + "learning_rate": 3.314126830512397e-05, + "loss": 0.5183, + "num_tokens": 251635307.0, + "step": 1315 + }, + { + "epoch": 0.45051194539249145, + "grad_norm": 0.3666740114223966, + "learning_rate": 3.3018239737821806e-05, + "loss": 0.4913, + "num_tokens": 252648795.0, + "step": 1320 + }, + { + "epoch": 0.4522184300341297, + "grad_norm": 0.3865152760583026, + "learning_rate": 3.289503537299616e-05, + "loss": 0.5326, + "num_tokens": 253618343.0, + "step": 1325 + }, + { + "epoch": 0.4539249146757679, + "grad_norm": 0.3882700849297493, + "learning_rate": 3.2771659135634564e-05, + "loss": 0.5033, + "num_tokens": 254539106.0, + "step": 1330 + }, + { + "epoch": 0.45563139931740615, + "grad_norm": 0.37283505962982216, + "learning_rate": 3.2648114956200005e-05, + "loss": 0.5134, + "num_tokens": 255475551.0, + "step": 1335 + }, + { + "epoch": 0.45733788395904434, + "grad_norm": 0.3736180838966003, + "learning_rate": 3.2524406770505675e-05, + "loss": 0.5212, + "num_tokens": 256460069.0, + "step": 1340 + }, + { + "epoch": 0.4590443686006826, + "grad_norm": 0.3711824529096275, + "learning_rate": 3.240053851958961e-05, + "loss": 0.4986, + "num_tokens": 257384246.0, + "step": 1345 + }, + { + "epoch": 0.46075085324232085, + "grad_norm": 0.35194816416518127, + "learning_rate": 3.227651414958912e-05, + "loss": 0.4996, + "num_tokens": 258439462.0, + "step": 1350 + }, + { + "epoch": 0.46245733788395904, + "grad_norm": 0.3636023146472485, + "learning_rate": 3.2152337611615096e-05, + "loss": 0.5128, + "num_tokens": 259419905.0, + "step": 1355 + }, + { + "epoch": 0.4641638225255973, + "grad_norm": 0.3499395456709178, + "learning_rate": 3.202801286162611e-05, + "loss": 0.529, + "num_tokens": 260499223.0, + "step": 1360 + }, + { + "epoch": 0.4658703071672355, + "grad_norm": 0.344200324854834, + "learning_rate": 3.1903543860302445e-05, + "loss": 0.4954, + "num_tokens": 261442637.0, + "step": 1365 + }, + { + "epoch": 0.46757679180887374, + "grad_norm": 0.34635826868295416, + "learning_rate": 3.1778934572919805e-05, + "loss": 0.5053, + "num_tokens": 262428104.0, + "step": 1370 + }, + { + "epoch": 0.46928327645051193, + "grad_norm": 0.3434947744560547, + "learning_rate": 3.165418896922313e-05, + "loss": 0.4892, + "num_tokens": 263310660.0, + "step": 1375 + }, + { + "epoch": 0.4709897610921502, + "grad_norm": 0.36603548416607035, + "learning_rate": 3.152931102330002e-05, + "loss": 0.5193, + "num_tokens": 264331327.0, + "step": 1380 + }, + { + "epoch": 0.4726962457337884, + "grad_norm": 0.36277672500545255, + "learning_rate": 3.140430471345419e-05, + "loss": 0.5103, + "num_tokens": 265270147.0, + "step": 1385 + }, + { + "epoch": 0.47440273037542663, + "grad_norm": 0.36093915798723425, + "learning_rate": 3.127917402207871e-05, + "loss": 0.5125, + "num_tokens": 266242185.0, + "step": 1390 + }, + { + "epoch": 0.4761092150170648, + "grad_norm": 0.3391219737377559, + "learning_rate": 3.115392293552915e-05, + "loss": 0.5119, + "num_tokens": 267191130.0, + "step": 1395 + }, + { + "epoch": 0.4778156996587031, + "grad_norm": 0.35339290653989003, + "learning_rate": 3.1028555443996544e-05, + "loss": 0.5099, + "num_tokens": 268142845.0, + "step": 1400 + }, + { + "epoch": 0.47952218430034127, + "grad_norm": 0.3520073614880941, + "learning_rate": 3.090307554138033e-05, + "loss": 0.527, + "num_tokens": 269116555.0, + "step": 1405 + }, + { + "epoch": 0.4812286689419795, + "grad_norm": 0.32746517176816037, + "learning_rate": 3.0777487225161096e-05, + "loss": 0.5171, + "num_tokens": 270078357.0, + "step": 1410 + }, + { + "epoch": 0.48293515358361777, + "grad_norm": 0.3801899716265933, + "learning_rate": 3.065179449627316e-05, + "loss": 0.5179, + "num_tokens": 271065401.0, + "step": 1415 + }, + { + "epoch": 0.48464163822525597, + "grad_norm": 0.34972336470548876, + "learning_rate": 3.0526001358977254e-05, + "loss": 0.5192, + "num_tokens": 272018748.0, + "step": 1420 + }, + { + "epoch": 0.4863481228668942, + "grad_norm": 0.4088433608953109, + "learning_rate": 3.0400111820732802e-05, + "loss": 0.5202, + "num_tokens": 273051158.0, + "step": 1425 + }, + { + "epoch": 0.4880546075085324, + "grad_norm": 0.4045530258228776, + "learning_rate": 3.0274129892070368e-05, + "loss": 0.5363, + "num_tokens": 274027158.0, + "step": 1430 + }, + { + "epoch": 0.48976109215017066, + "grad_norm": 0.37760264633069307, + "learning_rate": 3.014805958646383e-05, + "loss": 0.5071, + "num_tokens": 274976608.0, + "step": 1435 + }, + { + "epoch": 0.49146757679180886, + "grad_norm": 0.41544044167849326, + "learning_rate": 3.002190492020255e-05, + "loss": 0.5336, + "num_tokens": 275897357.0, + "step": 1440 + }, + { + "epoch": 0.4931740614334471, + "grad_norm": 0.3621173202789576, + "learning_rate": 2.9895669912263393e-05, + "loss": 0.4884, + "num_tokens": 276767022.0, + "step": 1445 + }, + { + "epoch": 0.4948805460750853, + "grad_norm": 0.35971250539401595, + "learning_rate": 2.9769358584182732e-05, + "loss": 0.4929, + "num_tokens": 277733458.0, + "step": 1450 + }, + { + "epoch": 0.49658703071672355, + "grad_norm": 0.3333198556836057, + "learning_rate": 2.9642974959928293e-05, + "loss": 0.5181, + "num_tokens": 278655070.0, + "step": 1455 + }, + { + "epoch": 0.49829351535836175, + "grad_norm": 0.40269115529983035, + "learning_rate": 2.9516523065771e-05, + "loss": 0.5092, + "num_tokens": 279550428.0, + "step": 1460 + }, + { + "epoch": 0.5, + "grad_norm": 0.3375843000024675, + "learning_rate": 2.9390006930156683e-05, + "loss": 0.5035, + "num_tokens": 280592599.0, + "step": 1465 + }, + { + "epoch": 0.5017064846416383, + "grad_norm": 0.33837995836306645, + "learning_rate": 2.9263430583577715e-05, + "loss": 0.4936, + "num_tokens": 281502549.0, + "step": 1470 + }, + { + "epoch": 0.5034129692832765, + "grad_norm": 0.3491138042125671, + "learning_rate": 2.9136798058444704e-05, + "loss": 0.5186, + "num_tokens": 282554594.0, + "step": 1475 + }, + { + "epoch": 0.5051194539249146, + "grad_norm": 0.36149705403685856, + "learning_rate": 2.9010113388957906e-05, + "loss": 0.4996, + "num_tokens": 283508120.0, + "step": 1480 + }, + { + "epoch": 0.5068259385665529, + "grad_norm": 0.3682570368717468, + "learning_rate": 2.8883380610978804e-05, + "loss": 0.4868, + "num_tokens": 284430674.0, + "step": 1485 + }, + { + "epoch": 0.5085324232081911, + "grad_norm": 0.3450199220270282, + "learning_rate": 2.875660376190149e-05, + "loss": 0.5225, + "num_tokens": 285480194.0, + "step": 1490 + }, + { + "epoch": 0.5102389078498294, + "grad_norm": 0.35852992172619397, + "learning_rate": 2.8629786880524057e-05, + "loss": 0.5044, + "num_tokens": 286426656.0, + "step": 1495 + }, + { + "epoch": 0.5119453924914675, + "grad_norm": 0.35758605357343837, + "learning_rate": 2.8502934006919908e-05, + "loss": 0.531, + "num_tokens": 287419124.0, + "step": 1500 + }, + { + "epoch": 0.5136518771331058, + "grad_norm": 0.3679994961058525, + "learning_rate": 2.83760491823091e-05, + "loss": 0.4891, + "num_tokens": 288343301.0, + "step": 1505 + }, + { + "epoch": 0.515358361774744, + "grad_norm": 0.386078898523489, + "learning_rate": 2.824913644892955e-05, + "loss": 0.4912, + "num_tokens": 289306762.0, + "step": 1510 + }, + { + "epoch": 0.5170648464163823, + "grad_norm": 0.33205328887110974, + "learning_rate": 2.8122199849908286e-05, + "loss": 0.5047, + "num_tokens": 290236538.0, + "step": 1515 + }, + { + "epoch": 0.5187713310580204, + "grad_norm": 0.36598920383011924, + "learning_rate": 2.7995243429132644e-05, + "loss": 0.5082, + "num_tokens": 291105578.0, + "step": 1520 + }, + { + "epoch": 0.5204778156996587, + "grad_norm": 0.3577740364047028, + "learning_rate": 2.7868271231121406e-05, + "loss": 0.5271, + "num_tokens": 292089939.0, + "step": 1525 + }, + { + "epoch": 0.5221843003412969, + "grad_norm": 0.3395160116353141, + "learning_rate": 2.7741287300896013e-05, + "loss": 0.4958, + "num_tokens": 293082816.0, + "step": 1530 + }, + { + "epoch": 0.5238907849829352, + "grad_norm": 0.3750202797810289, + "learning_rate": 2.7614295683851637e-05, + "loss": 0.5043, + "num_tokens": 293957075.0, + "step": 1535 + }, + { + "epoch": 0.5255972696245734, + "grad_norm": 0.36593612437784134, + "learning_rate": 2.7487300425628347e-05, + "loss": 0.4999, + "num_tokens": 294930434.0, + "step": 1540 + }, + { + "epoch": 0.5273037542662116, + "grad_norm": 0.36077082410017175, + "learning_rate": 2.7360305571982213e-05, + "loss": 0.517, + "num_tokens": 295898443.0, + "step": 1545 + }, + { + "epoch": 0.5290102389078498, + "grad_norm": 0.34084324496634494, + "learning_rate": 2.723331516865641e-05, + "loss": 0.5042, + "num_tokens": 296842807.0, + "step": 1550 + }, + { + "epoch": 0.5307167235494881, + "grad_norm": 0.3473414935833505, + "learning_rate": 2.7106333261252342e-05, + "loss": 0.5141, + "num_tokens": 297874811.0, + "step": 1555 + }, + { + "epoch": 0.5324232081911263, + "grad_norm": 0.37129914119401464, + "learning_rate": 2.697936389510073e-05, + "loss": 0.5019, + "num_tokens": 298726998.0, + "step": 1560 + }, + { + "epoch": 0.5341296928327645, + "grad_norm": 0.36028226696611454, + "learning_rate": 2.685241111513281e-05, + "loss": 0.5116, + "num_tokens": 299723782.0, + "step": 1565 + }, + { + "epoch": 0.5358361774744027, + "grad_norm": 0.3655240725721465, + "learning_rate": 2.6725478965751378e-05, + "loss": 0.4864, + "num_tokens": 300660125.0, + "step": 1570 + }, + { + "epoch": 0.537542662116041, + "grad_norm": 0.3527263064768574, + "learning_rate": 2.6598571490702013e-05, + "loss": 0.4997, + "num_tokens": 301489572.0, + "step": 1575 + }, + { + "epoch": 0.5392491467576792, + "grad_norm": 0.3514385599593041, + "learning_rate": 2.6471692732944227e-05, + "loss": 0.4773, + "num_tokens": 302437719.0, + "step": 1580 + }, + { + "epoch": 0.5409556313993175, + "grad_norm": 0.5417409716600186, + "learning_rate": 2.634484673452265e-05, + "loss": 0.5256, + "num_tokens": 303463770.0, + "step": 1585 + }, + { + "epoch": 0.5426621160409556, + "grad_norm": 0.3711273299009024, + "learning_rate": 2.6218037536438315e-05, + "loss": 0.5067, + "num_tokens": 304343518.0, + "step": 1590 + }, + { + "epoch": 0.5443686006825939, + "grad_norm": 0.38356094617087266, + "learning_rate": 2.6091269178519885e-05, + "loss": 0.5195, + "num_tokens": 305270656.0, + "step": 1595 + }, + { + "epoch": 0.5460750853242321, + "grad_norm": 0.36905989432450675, + "learning_rate": 2.5964545699294906e-05, + "loss": 0.5049, + "num_tokens": 306180961.0, + "step": 1600 + }, + { + "epoch": 0.5477815699658704, + "grad_norm": 0.3692380599855385, + "learning_rate": 2.583787113586126e-05, + "loss": 0.5315, + "num_tokens": 307152419.0, + "step": 1605 + }, + { + "epoch": 0.5494880546075085, + "grad_norm": 0.3565584564996635, + "learning_rate": 2.571124952375845e-05, + "loss": 0.5028, + "num_tokens": 308076053.0, + "step": 1610 + }, + { + "epoch": 0.5511945392491467, + "grad_norm": 0.6182642964929036, + "learning_rate": 2.55846848968391e-05, + "loss": 0.5168, + "num_tokens": 309029777.0, + "step": 1615 + }, + { + "epoch": 0.552901023890785, + "grad_norm": 0.3804959240952762, + "learning_rate": 2.545818128714043e-05, + "loss": 0.4985, + "num_tokens": 310003006.0, + "step": 1620 + }, + { + "epoch": 0.5546075085324232, + "grad_norm": 0.3585357836902996, + "learning_rate": 2.533174272475579e-05, + "loss": 0.4889, + "num_tokens": 310946881.0, + "step": 1625 + }, + { + "epoch": 0.5563139931740614, + "grad_norm": 0.36197226313264375, + "learning_rate": 2.52053732377063e-05, + "loss": 0.5011, + "num_tokens": 311908102.0, + "step": 1630 + }, + { + "epoch": 0.5580204778156996, + "grad_norm": 0.40032263570721643, + "learning_rate": 2.5079076851812476e-05, + "loss": 0.5089, + "num_tokens": 312808887.0, + "step": 1635 + }, + { + "epoch": 0.5597269624573379, + "grad_norm": 0.3451971824355068, + "learning_rate": 2.4952857590566043e-05, + "loss": 0.493, + "num_tokens": 313777123.0, + "step": 1640 + }, + { + "epoch": 0.5614334470989761, + "grad_norm": 0.3900504406034111, + "learning_rate": 2.4826719475001714e-05, + "loss": 0.5094, + "num_tokens": 314739056.0, + "step": 1645 + }, + { + "epoch": 0.5631399317406144, + "grad_norm": 0.3493313787920549, + "learning_rate": 2.4700666523569106e-05, + "loss": 0.4898, + "num_tokens": 315742426.0, + "step": 1650 + }, + { + "epoch": 0.5648464163822525, + "grad_norm": 0.3438326732935695, + "learning_rate": 2.4574702752004703e-05, + "loss": 0.5175, + "num_tokens": 316737000.0, + "step": 1655 + }, + { + "epoch": 0.5665529010238908, + "grad_norm": 0.35276865905225485, + "learning_rate": 2.444883217320395e-05, + "loss": 0.495, + "num_tokens": 317577413.0, + "step": 1660 + }, + { + "epoch": 0.568259385665529, + "grad_norm": 0.3407109448961259, + "learning_rate": 2.4323058797093395e-05, + "loss": 0.504, + "num_tokens": 318577669.0, + "step": 1665 + }, + { + "epoch": 0.5699658703071673, + "grad_norm": 0.4001170454014252, + "learning_rate": 2.4197386630502965e-05, + "loss": 0.4969, + "num_tokens": 319557900.0, + "step": 1670 + }, + { + "epoch": 0.5716723549488054, + "grad_norm": 0.34709303207454156, + "learning_rate": 2.407181967703826e-05, + "loss": 0.5009, + "num_tokens": 320511805.0, + "step": 1675 + }, + { + "epoch": 0.5733788395904437, + "grad_norm": 0.371204319077067, + "learning_rate": 2.3946361936953092e-05, + "loss": 0.5075, + "num_tokens": 321462994.0, + "step": 1680 + }, + { + "epoch": 0.5750853242320819, + "grad_norm": 0.35657413743817584, + "learning_rate": 2.382101740702199e-05, + "loss": 0.4846, + "num_tokens": 322380429.0, + "step": 1685 + }, + { + "epoch": 0.5767918088737202, + "grad_norm": 0.3373065447243538, + "learning_rate": 2.369579008041286e-05, + "loss": 0.5064, + "num_tokens": 323355363.0, + "step": 1690 + }, + { + "epoch": 0.5784982935153583, + "grad_norm": 0.34242628423967963, + "learning_rate": 2.3570683946559835e-05, + "loss": 0.5057, + "num_tokens": 324276849.0, + "step": 1695 + }, + { + "epoch": 0.5802047781569966, + "grad_norm": 0.3424829522431512, + "learning_rate": 2.3445702991036138e-05, + "loss": 0.4915, + "num_tokens": 325155802.0, + "step": 1700 + }, + { + "epoch": 0.5819112627986348, + "grad_norm": 0.38418791267218005, + "learning_rate": 2.332085119542711e-05, + "loss": 0.4747, + "num_tokens": 325996402.0, + "step": 1705 + }, + { + "epoch": 0.5836177474402731, + "grad_norm": 0.3720571745285186, + "learning_rate": 2.319613253720338e-05, + "loss": 0.5314, + "num_tokens": 326956942.0, + "step": 1710 + }, + { + "epoch": 0.5853242320819113, + "grad_norm": 0.35719133127267255, + "learning_rate": 2.3071550989594133e-05, + "loss": 0.5122, + "num_tokens": 327985119.0, + "step": 1715 + }, + { + "epoch": 0.5870307167235495, + "grad_norm": 0.38622831687102893, + "learning_rate": 2.2947110521460567e-05, + "loss": 0.4888, + "num_tokens": 328885222.0, + "step": 1720 + }, + { + "epoch": 0.5887372013651877, + "grad_norm": 0.35596529449616776, + "learning_rate": 2.2822815097169447e-05, + "loss": 0.5065, + "num_tokens": 329923181.0, + "step": 1725 + }, + { + "epoch": 0.590443686006826, + "grad_norm": 0.36830928128106777, + "learning_rate": 2.269866867646675e-05, + "loss": 0.4908, + "num_tokens": 330878184.0, + "step": 1730 + }, + { + "epoch": 0.5921501706484642, + "grad_norm": 0.333509463984278, + "learning_rate": 2.2574675214351622e-05, + "loss": 0.4683, + "num_tokens": 331849770.0, + "step": 1735 + }, + { + "epoch": 0.5938566552901023, + "grad_norm": 0.3657723804929488, + "learning_rate": 2.245083866095029e-05, + "loss": 0.498, + "num_tokens": 332821824.0, + "step": 1740 + }, + { + "epoch": 0.5955631399317406, + "grad_norm": 0.3709027281629384, + "learning_rate": 2.2327162961390254e-05, + "loss": 0.5101, + "num_tokens": 333794769.0, + "step": 1745 + }, + { + "epoch": 0.5972696245733788, + "grad_norm": 0.34103457669200804, + "learning_rate": 2.2203652055674633e-05, + "loss": 0.4935, + "num_tokens": 334798404.0, + "step": 1750 + }, + { + "epoch": 0.5989761092150171, + "grad_norm": 0.3545726343474071, + "learning_rate": 2.20803098785566e-05, + "loss": 0.4833, + "num_tokens": 335687213.0, + "step": 1755 + }, + { + "epoch": 0.6006825938566553, + "grad_norm": 0.3340674324445749, + "learning_rate": 2.1957140359414063e-05, + "loss": 0.4651, + "num_tokens": 336651049.0, + "step": 1760 + }, + { + "epoch": 0.6023890784982935, + "grad_norm": 0.36127627803351964, + "learning_rate": 2.1834147422124463e-05, + "loss": 0.4772, + "num_tokens": 337519072.0, + "step": 1765 + }, + { + "epoch": 0.6040955631399317, + "grad_norm": 0.3835855921099779, + "learning_rate": 2.1711334984939767e-05, + "loss": 0.5155, + "num_tokens": 338438000.0, + "step": 1770 + }, + { + "epoch": 0.60580204778157, + "grad_norm": 0.30765262583419745, + "learning_rate": 2.1588706960361682e-05, + "loss": 0.5165, + "num_tokens": 339506233.0, + "step": 1775 + }, + { + "epoch": 0.6075085324232082, + "grad_norm": 0.3248110534125549, + "learning_rate": 2.146626725501697e-05, + "loss": 0.4952, + "num_tokens": 340414967.0, + "step": 1780 + }, + { + "epoch": 0.6092150170648464, + "grad_norm": 0.3774926928186119, + "learning_rate": 2.134401976953299e-05, + "loss": 0.5206, + "num_tokens": 341348316.0, + "step": 1785 + }, + { + "epoch": 0.6109215017064846, + "grad_norm": 0.3532666917926633, + "learning_rate": 2.1221968398413477e-05, + "loss": 0.4882, + "num_tokens": 342244656.0, + "step": 1790 + }, + { + "epoch": 0.6126279863481229, + "grad_norm": 0.3502010878134099, + "learning_rate": 2.1100117029914434e-05, + "loss": 0.4849, + "num_tokens": 343244894.0, + "step": 1795 + }, + { + "epoch": 0.6143344709897611, + "grad_norm": 0.34456710514750377, + "learning_rate": 2.0978469545920254e-05, + "loss": 0.5066, + "num_tokens": 344295726.0, + "step": 1800 + }, + { + "epoch": 0.6160409556313993, + "grad_norm": 0.32734359098421567, + "learning_rate": 2.0857029821820113e-05, + "loss": 0.5014, + "num_tokens": 345312852.0, + "step": 1805 + }, + { + "epoch": 0.6177474402730375, + "grad_norm": 0.37196057474243177, + "learning_rate": 2.0735801726384436e-05, + "loss": 0.5103, + "num_tokens": 346263433.0, + "step": 1810 + }, + { + "epoch": 0.6194539249146758, + "grad_norm": 0.32459376752536473, + "learning_rate": 2.0614789121641688e-05, + "loss": 0.5038, + "num_tokens": 347219412.0, + "step": 1815 + }, + { + "epoch": 0.621160409556314, + "grad_norm": 0.36986659746774475, + "learning_rate": 2.0493995862755333e-05, + "loss": 0.4975, + "num_tokens": 348137882.0, + "step": 1820 + }, + { + "epoch": 0.6228668941979523, + "grad_norm": 0.4026711619598764, + "learning_rate": 2.0373425797901024e-05, + "loss": 0.5169, + "num_tokens": 349064203.0, + "step": 1825 + }, + { + "epoch": 0.6245733788395904, + "grad_norm": 0.3497696696697358, + "learning_rate": 2.0253082768143976e-05, + "loss": 0.4985, + "num_tokens": 349987787.0, + "step": 1830 + }, + { + "epoch": 0.6262798634812287, + "grad_norm": 0.35282576206861677, + "learning_rate": 2.0132970607316677e-05, + "loss": 0.4961, + "num_tokens": 350963679.0, + "step": 1835 + }, + { + "epoch": 0.6279863481228669, + "grad_norm": 0.33944555479530397, + "learning_rate": 2.0013093141896634e-05, + "loss": 0.4743, + "num_tokens": 351875623.0, + "step": 1840 + }, + { + "epoch": 0.6296928327645052, + "grad_norm": 0.3720204410754147, + "learning_rate": 1.989345419088458e-05, + "loss": 0.4853, + "num_tokens": 352834128.0, + "step": 1845 + }, + { + "epoch": 0.6313993174061433, + "grad_norm": 0.3288718937132713, + "learning_rate": 1.9774057565682768e-05, + "loss": 0.4954, + "num_tokens": 353796065.0, + "step": 1850 + }, + { + "epoch": 0.6331058020477816, + "grad_norm": 0.3227249405857713, + "learning_rate": 1.965490706997351e-05, + "loss": 0.4869, + "num_tokens": 354752780.0, + "step": 1855 + }, + { + "epoch": 0.6348122866894198, + "grad_norm": 0.3312191918205175, + "learning_rate": 1.9536006499598085e-05, + "loss": 0.4953, + "num_tokens": 355697743.0, + "step": 1860 + }, + { + "epoch": 0.636518771331058, + "grad_norm": 0.3273924968177549, + "learning_rate": 1.941735964243574e-05, + "loss": 0.4905, + "num_tokens": 356707970.0, + "step": 1865 + }, + { + "epoch": 0.6382252559726962, + "grad_norm": 0.3733694588578265, + "learning_rate": 1.9298970278283046e-05, + "loss": 0.5312, + "num_tokens": 357607500.0, + "step": 1870 + }, + { + "epoch": 0.6399317406143344, + "grad_norm": 0.3377666062483733, + "learning_rate": 1.918084217873349e-05, + "loss": 0.5072, + "num_tokens": 358549752.0, + "step": 1875 + }, + { + "epoch": 0.6416382252559727, + "grad_norm": 0.3795132924337565, + "learning_rate": 1.90629791070573e-05, + "loss": 0.5073, + "num_tokens": 359496768.0, + "step": 1880 + }, + { + "epoch": 0.643344709897611, + "grad_norm": 0.3393948612318191, + "learning_rate": 1.8945384818081574e-05, + "loss": 0.4666, + "num_tokens": 360449477.0, + "step": 1885 + }, + { + "epoch": 0.6450511945392492, + "grad_norm": 0.3362849089493312, + "learning_rate": 1.882806305807067e-05, + "loss": 0.4991, + "num_tokens": 361389017.0, + "step": 1890 + }, + { + "epoch": 0.6467576791808873, + "grad_norm": 0.34043285210954316, + "learning_rate": 1.871101756460682e-05, + "loss": 0.4755, + "num_tokens": 362299106.0, + "step": 1895 + }, + { + "epoch": 0.6484641638225256, + "grad_norm": 0.3496235222336366, + "learning_rate": 1.8594252066471108e-05, + "loss": 0.4994, + "num_tokens": 363249804.0, + "step": 1900 + }, + { + "epoch": 0.6501706484641638, + "grad_norm": 0.35219312570924277, + "learning_rate": 1.847777028352463e-05, + "loss": 0.505, + "num_tokens": 364170107.0, + "step": 1905 + }, + { + "epoch": 0.6518771331058021, + "grad_norm": 0.3675703568778994, + "learning_rate": 1.8361575926590034e-05, + "loss": 0.4798, + "num_tokens": 365084839.0, + "step": 1910 + }, + { + "epoch": 0.6535836177474402, + "grad_norm": 0.33143812395942357, + "learning_rate": 1.8245672697333288e-05, + "loss": 0.4933, + "num_tokens": 365997466.0, + "step": 1915 + }, + { + "epoch": 0.6552901023890785, + "grad_norm": 0.32956885993414253, + "learning_rate": 1.8130064288145737e-05, + "loss": 0.4724, + "num_tokens": 366923656.0, + "step": 1920 + }, + { + "epoch": 0.6569965870307167, + "grad_norm": 0.3282570628204338, + "learning_rate": 1.801475438202648e-05, + "loss": 0.5023, + "num_tokens": 367958683.0, + "step": 1925 + }, + { + "epoch": 0.658703071672355, + "grad_norm": 0.33433791259781837, + "learning_rate": 1.789974665246507e-05, + "loss": 0.5161, + "num_tokens": 368897813.0, + "step": 1930 + }, + { + "epoch": 0.6604095563139932, + "grad_norm": 0.33188996124132675, + "learning_rate": 1.7785044763324415e-05, + "loss": 0.4924, + "num_tokens": 369808844.0, + "step": 1935 + }, + { + "epoch": 0.6621160409556314, + "grad_norm": 0.3305327165261152, + "learning_rate": 1.7670652368724144e-05, + "loss": 0.4928, + "num_tokens": 370786942.0, + "step": 1940 + }, + { + "epoch": 0.6638225255972696, + "grad_norm": 0.3292219960944428, + "learning_rate": 1.7556573112924135e-05, + "loss": 0.4675, + "num_tokens": 371657863.0, + "step": 1945 + }, + { + "epoch": 0.6655290102389079, + "grad_norm": 0.31411324035322963, + "learning_rate": 1.7442810630208446e-05, + "loss": 0.4831, + "num_tokens": 372630696.0, + "step": 1950 + }, + { + "epoch": 0.6672354948805461, + "grad_norm": 0.3217654495266495, + "learning_rate": 1.7329368544769487e-05, + "loss": 0.5029, + "num_tokens": 373650740.0, + "step": 1955 + }, + { + "epoch": 0.6689419795221843, + "grad_norm": 0.30398529228133475, + "learning_rate": 1.721625047059265e-05, + "loss": 0.4927, + "num_tokens": 374628223.0, + "step": 1960 + }, + { + "epoch": 0.6706484641638225, + "grad_norm": 0.3690454794887621, + "learning_rate": 1.7103460011341084e-05, + "loss": 0.4882, + "num_tokens": 375573909.0, + "step": 1965 + }, + { + "epoch": 0.6723549488054608, + "grad_norm": 0.32789625269764505, + "learning_rate": 1.699100076024099e-05, + "loss": 0.4697, + "num_tokens": 376493989.0, + "step": 1970 + }, + { + "epoch": 0.674061433447099, + "grad_norm": 0.3425335012653658, + "learning_rate": 1.6878876299967018e-05, + "loss": 0.4706, + "num_tokens": 377479804.0, + "step": 1975 + }, + { + "epoch": 0.6757679180887372, + "grad_norm": 0.3506912584928773, + "learning_rate": 1.6767090202528268e-05, + "loss": 0.4884, + "num_tokens": 378392822.0, + "step": 1980 + }, + { + "epoch": 0.6774744027303754, + "grad_norm": 0.3366127448224504, + "learning_rate": 1.6655646029154402e-05, + "loss": 0.4757, + "num_tokens": 379328234.0, + "step": 1985 + }, + { + "epoch": 0.6791808873720137, + "grad_norm": 0.33730121762767445, + "learning_rate": 1.6544547330182234e-05, + "loss": 0.4683, + "num_tokens": 380308538.0, + "step": 1990 + }, + { + "epoch": 0.6808873720136519, + "grad_norm": 0.3150426444385526, + "learning_rate": 1.6433797644942633e-05, + "loss": 0.4975, + "num_tokens": 381210797.0, + "step": 1995 + }, + { + "epoch": 0.6825938566552902, + "grad_norm": 0.31875516644753304, + "learning_rate": 1.63234005016477e-05, + "loss": 0.4942, + "num_tokens": 382166430.0, + "step": 2000 + }, + { + "epoch": 0.6843003412969283, + "grad_norm": 0.32666915132106794, + "learning_rate": 1.6213359417278473e-05, + "loss": 0.5085, + "num_tokens": 383179056.0, + "step": 2005 + }, + { + "epoch": 0.6860068259385665, + "grad_norm": 0.32712173146500084, + "learning_rate": 1.6103677897472794e-05, + "loss": 0.5003, + "num_tokens": 384075218.0, + "step": 2010 + }, + { + "epoch": 0.6877133105802048, + "grad_norm": 0.3287315519317715, + "learning_rate": 1.599435943641368e-05, + "loss": 0.4702, + "num_tokens": 384999949.0, + "step": 2015 + }, + { + "epoch": 0.689419795221843, + "grad_norm": 0.38670376122956074, + "learning_rate": 1.5885407516717987e-05, + "loss": 0.4908, + "num_tokens": 385900887.0, + "step": 2020 + }, + { + "epoch": 0.6911262798634812, + "grad_norm": 0.35663261838972243, + "learning_rate": 1.577682560932547e-05, + "loss": 0.4978, + "num_tokens": 386870114.0, + "step": 2025 + }, + { + "epoch": 0.6928327645051194, + "grad_norm": 0.42642527854196, + "learning_rate": 1.566861717338819e-05, + "loss": 0.4906, + "num_tokens": 387782669.0, + "step": 2030 + }, + { + "epoch": 0.6945392491467577, + "grad_norm": 0.37343767857022897, + "learning_rate": 1.556078565616034e-05, + "loss": 0.4902, + "num_tokens": 388715961.0, + "step": 2035 + }, + { + "epoch": 0.6962457337883959, + "grad_norm": 0.3284331766719929, + "learning_rate": 1.5453334492888428e-05, + "loss": 0.4776, + "num_tokens": 389650899.0, + "step": 2040 + }, + { + "epoch": 0.6979522184300341, + "grad_norm": 0.3483829695120365, + "learning_rate": 1.5346267106701762e-05, + "loss": 0.4836, + "num_tokens": 390610942.0, + "step": 2045 + }, + { + "epoch": 0.6996587030716723, + "grad_norm": 0.32022610770787235, + "learning_rate": 1.5239586908503533e-05, + "loss": 0.5172, + "num_tokens": 391632321.0, + "step": 2050 + }, + { + "epoch": 0.7013651877133106, + "grad_norm": 0.3514714730074271, + "learning_rate": 1.513329729686203e-05, + "loss": 0.4854, + "num_tokens": 392626976.0, + "step": 2055 + }, + { + "epoch": 0.7030716723549488, + "grad_norm": 0.3413532016336787, + "learning_rate": 1.502740165790244e-05, + "loss": 0.4856, + "num_tokens": 393493604.0, + "step": 2060 + }, + { + "epoch": 0.7047781569965871, + "grad_norm": 0.3422857376051543, + "learning_rate": 1.4921903365198914e-05, + "loss": 0.5084, + "num_tokens": 394371570.0, + "step": 2065 + }, + { + "epoch": 0.7064846416382252, + "grad_norm": 0.3572958103887209, + "learning_rate": 1.481680577966717e-05, + "loss": 0.4963, + "num_tokens": 395329185.0, + "step": 2070 + }, + { + "epoch": 0.7081911262798635, + "grad_norm": 0.34758264074134787, + "learning_rate": 1.471211224945736e-05, + "loss": 0.4905, + "num_tokens": 396285000.0, + "step": 2075 + }, + { + "epoch": 0.7098976109215017, + "grad_norm": 0.3419972923917919, + "learning_rate": 1.4607826109847458e-05, + "loss": 0.5266, + "num_tokens": 397224172.0, + "step": 2080 + }, + { + "epoch": 0.71160409556314, + "grad_norm": 0.337893094968607, + "learning_rate": 1.4503950683136936e-05, + "loss": 0.4857, + "num_tokens": 398210109.0, + "step": 2085 + }, + { + "epoch": 0.7133105802047781, + "grad_norm": 0.33901468478209035, + "learning_rate": 1.4400489278540985e-05, + "loss": 0.4749, + "num_tokens": 399158135.0, + "step": 2090 + }, + { + "epoch": 0.7150170648464164, + "grad_norm": 0.3835956412834383, + "learning_rate": 1.429744519208508e-05, + "loss": 0.4936, + "num_tokens": 400075133.0, + "step": 2095 + }, + { + "epoch": 0.7167235494880546, + "grad_norm": 0.34331595173677726, + "learning_rate": 1.4194821706499955e-05, + "loss": 0.5031, + "num_tokens": 400990040.0, + "step": 2100 + }, + { + "epoch": 0.7184300341296929, + "grad_norm": 0.3171937953485873, + "learning_rate": 1.4092622091117041e-05, + "loss": 0.4815, + "num_tokens": 401912436.0, + "step": 2105 + }, + { + "epoch": 0.7201365187713311, + "grad_norm": 0.3410195330730544, + "learning_rate": 1.399084960176431e-05, + "loss": 0.4741, + "num_tokens": 402861165.0, + "step": 2110 + }, + { + "epoch": 0.7218430034129693, + "grad_norm": 0.3465187152384934, + "learning_rate": 1.3889507480662545e-05, + "loss": 0.4913, + "num_tokens": 403763990.0, + "step": 2115 + }, + { + "epoch": 0.7235494880546075, + "grad_norm": 0.3292708548169215, + "learning_rate": 1.3788598956322068e-05, + "loss": 0.4858, + "num_tokens": 404752387.0, + "step": 2120 + }, + { + "epoch": 0.7252559726962458, + "grad_norm": 0.36695144477657526, + "learning_rate": 1.3688127243439863e-05, + "loss": 0.4838, + "num_tokens": 405691554.0, + "step": 2125 + }, + { + "epoch": 0.726962457337884, + "grad_norm": 0.3372811093945545, + "learning_rate": 1.3588095542797186e-05, + "loss": 0.4947, + "num_tokens": 406680793.0, + "step": 2130 + }, + { + "epoch": 0.7286689419795221, + "grad_norm": 0.3443503709289555, + "learning_rate": 1.3488507041157584e-05, + "loss": 0.4921, + "num_tokens": 407683383.0, + "step": 2135 + }, + { + "epoch": 0.7303754266211604, + "grad_norm": 0.3480092507121765, + "learning_rate": 1.3389364911165375e-05, + "loss": 0.4846, + "num_tokens": 408650161.0, + "step": 2140 + }, + { + "epoch": 0.7320819112627986, + "grad_norm": 0.3353926863066775, + "learning_rate": 1.3290672311244584e-05, + "loss": 0.5006, + "num_tokens": 409549494.0, + "step": 2145 + }, + { + "epoch": 0.7337883959044369, + "grad_norm": 0.37867503643842687, + "learning_rate": 1.3192432385498305e-05, + "loss": 0.4921, + "num_tokens": 410510602.0, + "step": 2150 + }, + { + "epoch": 0.735494880546075, + "grad_norm": 0.3592680929736747, + "learning_rate": 1.3094648263608533e-05, + "loss": 0.4981, + "num_tokens": 411492905.0, + "step": 2155 + }, + { + "epoch": 0.7372013651877133, + "grad_norm": 0.3322392394703493, + "learning_rate": 1.299732306073652e-05, + "loss": 0.487, + "num_tokens": 412454003.0, + "step": 2160 + }, + { + "epoch": 0.7389078498293515, + "grad_norm": 0.3756417144546959, + "learning_rate": 1.2900459877423457e-05, + "loss": 0.5106, + "num_tokens": 413421190.0, + "step": 2165 + }, + { + "epoch": 0.7406143344709898, + "grad_norm": 0.3276238813240706, + "learning_rate": 1.2804061799491734e-05, + "loss": 0.4945, + "num_tokens": 414425737.0, + "step": 2170 + }, + { + "epoch": 0.742320819112628, + "grad_norm": 0.29735177345606206, + "learning_rate": 1.2708131897946621e-05, + "loss": 0.478, + "num_tokens": 415344538.0, + "step": 2175 + }, + { + "epoch": 0.7440273037542662, + "grad_norm": 0.3125870087788144, + "learning_rate": 1.261267322887845e-05, + "loss": 0.5041, + "num_tokens": 416440659.0, + "step": 2180 + }, + { + "epoch": 0.7457337883959044, + "grad_norm": 0.32461771570892695, + "learning_rate": 1.251768883336526e-05, + "loss": 0.4919, + "num_tokens": 417360385.0, + "step": 2185 + }, + { + "epoch": 0.7474402730375427, + "grad_norm": 0.3442194296895731, + "learning_rate": 1.2423181737375899e-05, + "loss": 0.4836, + "num_tokens": 418334906.0, + "step": 2190 + }, + { + "epoch": 0.7491467576791809, + "grad_norm": 0.3446911329824469, + "learning_rate": 1.2329154951673598e-05, + "loss": 0.4646, + "num_tokens": 419196059.0, + "step": 2195 + }, + { + "epoch": 0.7508532423208191, + "grad_norm": 0.33767235227444237, + "learning_rate": 1.2235611471720123e-05, + "loss": 0.4856, + "num_tokens": 420121223.0, + "step": 2200 + }, + { + "epoch": 0.7525597269624573, + "grad_norm": 0.31616744714729916, + "learning_rate": 1.2142554277580288e-05, + "loss": 0.4867, + "num_tokens": 421062594.0, + "step": 2205 + }, + { + "epoch": 0.7542662116040956, + "grad_norm": 0.3288211934138168, + "learning_rate": 1.2049986333827048e-05, + "loss": 0.4672, + "num_tokens": 421975487.0, + "step": 2210 + }, + { + "epoch": 0.7559726962457338, + "grad_norm": 0.3496778301287439, + "learning_rate": 1.1957910589447043e-05, + "loss": 0.4861, + "num_tokens": 422820853.0, + "step": 2215 + }, + { + "epoch": 0.757679180887372, + "grad_norm": 0.3475005774602, + "learning_rate": 1.1866329977746656e-05, + "loss": 0.4882, + "num_tokens": 423755589.0, + "step": 2220 + }, + { + "epoch": 0.7593856655290102, + "grad_norm": 0.35706393638603534, + "learning_rate": 1.177524741625856e-05, + "loss": 0.4887, + "num_tokens": 424688821.0, + "step": 2225 + }, + { + "epoch": 0.7610921501706485, + "grad_norm": 0.3461078636788691, + "learning_rate": 1.1684665806648772e-05, + "loss": 0.4684, + "num_tokens": 425585640.0, + "step": 2230 + }, + { + "epoch": 0.7627986348122867, + "grad_norm": 0.3356141138518873, + "learning_rate": 1.1594588034624228e-05, + "loss": 0.4813, + "num_tokens": 426547476.0, + "step": 2235 + }, + { + "epoch": 0.764505119453925, + "grad_norm": 0.3105226360165998, + "learning_rate": 1.1505016969840823e-05, + "loss": 0.4745, + "num_tokens": 427476418.0, + "step": 2240 + }, + { + "epoch": 0.7662116040955631, + "grad_norm": 0.32709156257769095, + "learning_rate": 1.1415955465812023e-05, + "loss": 0.4887, + "num_tokens": 428405822.0, + "step": 2245 + }, + { + "epoch": 0.7679180887372014, + "grad_norm": 0.30756496657170446, + "learning_rate": 1.1327406359817933e-05, + "loss": 0.4774, + "num_tokens": 429400796.0, + "step": 2250 + }, + { + "epoch": 0.7696245733788396, + "grad_norm": 0.3111754417183422, + "learning_rate": 1.1239372472814927e-05, + "loss": 0.4805, + "num_tokens": 430392694.0, + "step": 2255 + }, + { + "epoch": 0.7713310580204779, + "grad_norm": 0.34095902728307337, + "learning_rate": 1.1151856609345774e-05, + "loss": 0.4716, + "num_tokens": 431359520.0, + "step": 2260 + }, + { + "epoch": 0.773037542662116, + "grad_norm": 0.34936226176674845, + "learning_rate": 1.1064861557450256e-05, + "loss": 0.4894, + "num_tokens": 432294915.0, + "step": 2265 + }, + { + "epoch": 0.7747440273037542, + "grad_norm": 0.3155937127951743, + "learning_rate": 1.0978390088576437e-05, + "loss": 0.481, + "num_tokens": 433284774.0, + "step": 2270 + }, + { + "epoch": 0.7764505119453925, + "grad_norm": 0.3202827342005119, + "learning_rate": 1.0892444957492276e-05, + "loss": 0.4891, + "num_tokens": 434284592.0, + "step": 2275 + }, + { + "epoch": 0.7781569965870307, + "grad_norm": 0.3369162033659964, + "learning_rate": 1.0807028902197925e-05, + "loss": 0.4654, + "num_tokens": 435149765.0, + "step": 2280 + }, + { + "epoch": 0.7798634812286689, + "grad_norm": 0.3582446923094639, + "learning_rate": 1.0722144643838461e-05, + "loss": 0.4866, + "num_tokens": 436158148.0, + "step": 2285 + }, + { + "epoch": 0.7815699658703071, + "grad_norm": 0.3311676374288052, + "learning_rate": 1.063779488661724e-05, + "loss": 0.4776, + "num_tokens": 437135437.0, + "step": 2290 + }, + { + "epoch": 0.7832764505119454, + "grad_norm": 0.32325241567650026, + "learning_rate": 1.0553982317709741e-05, + "loss": 0.4654, + "num_tokens": 438061307.0, + "step": 2295 + }, + { + "epoch": 0.7849829351535836, + "grad_norm": 0.3043727173598455, + "learning_rate": 1.047070960717793e-05, + "loss": 0.4932, + "num_tokens": 439079263.0, + "step": 2300 + }, + { + "epoch": 0.7866894197952219, + "grad_norm": 0.3361341921228823, + "learning_rate": 1.0387979407885198e-05, + "loss": 0.506, + "num_tokens": 440154096.0, + "step": 2305 + }, + { + "epoch": 0.78839590443686, + "grad_norm": 0.31103435909845967, + "learning_rate": 1.03057943554119e-05, + "loss": 0.4848, + "num_tokens": 441195758.0, + "step": 2310 + }, + { + "epoch": 0.7901023890784983, + "grad_norm": 0.32529003714436683, + "learning_rate": 1.022415706797133e-05, + "loss": 0.4941, + "num_tokens": 442194379.0, + "step": 2315 + }, + { + "epoch": 0.7918088737201365, + "grad_norm": 0.34129914389197974, + "learning_rate": 1.0143070146326347e-05, + "loss": 0.4965, + "num_tokens": 443118717.0, + "step": 2320 + }, + { + "epoch": 0.7935153583617748, + "grad_norm": 0.34574447344569287, + "learning_rate": 1.0062536173706519e-05, + "loss": 0.4833, + "num_tokens": 444049001.0, + "step": 2325 + }, + { + "epoch": 0.7952218430034129, + "grad_norm": 0.39058472684776835, + "learning_rate": 9.982557715725807e-06, + "loss": 0.4855, + "num_tokens": 444948197.0, + "step": 2330 + }, + { + "epoch": 0.7969283276450512, + "grad_norm": 0.3223678497972808, + "learning_rate": 9.903137320300852e-06, + "loss": 0.4923, + "num_tokens": 445993006.0, + "step": 2335 + }, + { + "epoch": 0.7986348122866894, + "grad_norm": 0.3581110760327946, + "learning_rate": 9.824277517569791e-06, + "loss": 0.4714, + "num_tokens": 446925677.0, + "step": 2340 + }, + { + "epoch": 0.8003412969283277, + "grad_norm": 0.35940172044204516, + "learning_rate": 9.745980819811668e-06, + "loss": 0.4838, + "num_tokens": 447799196.0, + "step": 2345 + }, + { + "epoch": 0.8020477815699659, + "grad_norm": 0.34500268650073257, + "learning_rate": 9.66824972136638e-06, + "loss": 0.493, + "num_tokens": 448739177.0, + "step": 2350 + }, + { + "epoch": 0.8037542662116041, + "grad_norm": 0.32399229631376597, + "learning_rate": 9.59108669855523e-06, + "loss": 0.5037, + "num_tokens": 449751958.0, + "step": 2355 + }, + { + "epoch": 0.8054607508532423, + "grad_norm": 0.29766759242151386, + "learning_rate": 9.514494209602023e-06, + "loss": 0.5071, + "num_tokens": 450761568.0, + "step": 2360 + }, + { + "epoch": 0.8071672354948806, + "grad_norm": 0.3430178270787923, + "learning_rate": 9.438474694554775e-06, + "loss": 0.4935, + "num_tokens": 451740507.0, + "step": 2365 + }, + { + "epoch": 0.8088737201365188, + "grad_norm": 0.3471410052900583, + "learning_rate": 9.36303057520795e-06, + "loss": 0.4713, + "num_tokens": 452625293.0, + "step": 2370 + }, + { + "epoch": 0.810580204778157, + "grad_norm": 0.3493958783029142, + "learning_rate": 9.288164255025334e-06, + "loss": 0.4823, + "num_tokens": 453642012.0, + "step": 2375 + }, + { + "epoch": 0.8122866894197952, + "grad_norm": 0.33853142356519106, + "learning_rate": 9.21387811906344e-06, + "loss": 0.4823, + "num_tokens": 454558803.0, + "step": 2380 + }, + { + "epoch": 0.8139931740614335, + "grad_norm": 0.3227168370953717, + "learning_rate": 9.14017453389556e-06, + "loss": 0.476, + "num_tokens": 455523392.0, + "step": 2385 + }, + { + "epoch": 0.8156996587030717, + "grad_norm": 0.29404894296430983, + "learning_rate": 9.067055847536346e-06, + "loss": 0.4596, + "num_tokens": 456494011.0, + "step": 2390 + }, + { + "epoch": 0.8174061433447098, + "grad_norm": 0.3308718804211504, + "learning_rate": 8.994524389367001e-06, + "loss": 0.4891, + "num_tokens": 457401137.0, + "step": 2395 + }, + { + "epoch": 0.8191126279863481, + "grad_norm": 0.3300071384128006, + "learning_rate": 8.922582470061099e-06, + "loss": 0.4961, + "num_tokens": 458401399.0, + "step": 2400 + }, + { + "epoch": 0.8208191126279863, + "grad_norm": 0.44818729004437646, + "learning_rate": 8.851232381510961e-06, + "loss": 0.504, + "num_tokens": 459365515.0, + "step": 2405 + }, + { + "epoch": 0.8225255972696246, + "grad_norm": 0.3842623998501234, + "learning_rate": 8.780476396754633e-06, + "loss": 0.4931, + "num_tokens": 460303546.0, + "step": 2410 + }, + { + "epoch": 0.8242320819112628, + "grad_norm": 0.33418576004104283, + "learning_rate": 8.710316769903471e-06, + "loss": 0.4868, + "num_tokens": 461172152.0, + "step": 2415 + }, + { + "epoch": 0.825938566552901, + "grad_norm": 0.368646183750318, + "learning_rate": 8.640755736070346e-06, + "loss": 0.4579, + "num_tokens": 462132037.0, + "step": 2420 + }, + { + "epoch": 0.8276450511945392, + "grad_norm": 0.32508360537240183, + "learning_rate": 8.571795511298423e-06, + "loss": 0.4853, + "num_tokens": 463170048.0, + "step": 2425 + }, + { + "epoch": 0.8293515358361775, + "grad_norm": 0.3211106939852037, + "learning_rate": 8.50343829249059e-06, + "loss": 0.4593, + "num_tokens": 464065062.0, + "step": 2430 + }, + { + "epoch": 0.8310580204778157, + "grad_norm": 0.3053006101651597, + "learning_rate": 8.435686257339417e-06, + "loss": 0.4831, + "num_tokens": 465056306.0, + "step": 2435 + }, + { + "epoch": 0.8327645051194539, + "grad_norm": 0.31115117178982893, + "learning_rate": 8.368541564257842e-06, + "loss": 0.4907, + "num_tokens": 466050672.0, + "step": 2440 + }, + { + "epoch": 0.8344709897610921, + "grad_norm": 0.3397295723653259, + "learning_rate": 8.302006352310369e-06, + "loss": 0.4966, + "num_tokens": 467046976.0, + "step": 2445 + }, + { + "epoch": 0.8361774744027304, + "grad_norm": 0.3382052634070017, + "learning_rate": 8.236082741144938e-06, + "loss": 0.4638, + "num_tokens": 468039326.0, + "step": 2450 + }, + { + "epoch": 0.8378839590443686, + "grad_norm": 0.3514456973966274, + "learning_rate": 8.170772830925389e-06, + "loss": 0.4653, + "num_tokens": 468922373.0, + "step": 2455 + }, + { + "epoch": 0.8395904436860068, + "grad_norm": 0.30836591986791617, + "learning_rate": 8.106078702264573e-06, + "loss": 0.4829, + "num_tokens": 469868923.0, + "step": 2460 + }, + { + "epoch": 0.841296928327645, + "grad_norm": 0.33389876659214884, + "learning_rate": 8.042002416158047e-06, + "loss": 0.471, + "num_tokens": 470752870.0, + "step": 2465 + }, + { + "epoch": 0.8430034129692833, + "grad_norm": 0.34528058311355864, + "learning_rate": 7.978546013918428e-06, + "loss": 0.4806, + "num_tokens": 471694644.0, + "step": 2470 + }, + { + "epoch": 0.8447098976109215, + "grad_norm": 0.30449852181321135, + "learning_rate": 7.915711517110365e-06, + "loss": 0.4726, + "num_tokens": 472652423.0, + "step": 2475 + }, + { + "epoch": 0.8464163822525598, + "grad_norm": 0.3010535970843912, + "learning_rate": 7.853500927486129e-06, + "loss": 0.4734, + "num_tokens": 473648633.0, + "step": 2480 + }, + { + "epoch": 0.8481228668941979, + "grad_norm": 0.319770912075749, + "learning_rate": 7.791916226921844e-06, + "loss": 0.493, + "num_tokens": 474686021.0, + "step": 2485 + }, + { + "epoch": 0.8498293515358362, + "grad_norm": 0.3101950532253656, + "learning_rate": 7.730959377354354e-06, + "loss": 0.4811, + "num_tokens": 475597050.0, + "step": 2490 + }, + { + "epoch": 0.8515358361774744, + "grad_norm": 0.336474137392747, + "learning_rate": 7.670632320718714e-06, + "loss": 0.4985, + "num_tokens": 476480863.0, + "step": 2495 + }, + { + "epoch": 0.8532423208191127, + "grad_norm": 0.33239250078197224, + "learning_rate": 7.610936978886332e-06, + "loss": 0.4889, + "num_tokens": 477480036.0, + "step": 2500 + }, + { + "epoch": 0.8549488054607508, + "grad_norm": 0.31906385732968195, + "learning_rate": 7.551875253603726e-06, + "loss": 0.4913, + "num_tokens": 478441727.0, + "step": 2505 + }, + { + "epoch": 0.856655290102389, + "grad_norm": 0.2972232303527529, + "learning_rate": 7.493449026431963e-06, + "loss": 0.4846, + "num_tokens": 479450987.0, + "step": 2510 + }, + { + "epoch": 0.8583617747440273, + "grad_norm": 0.34482331352077483, + "learning_rate": 7.4356601586867094e-06, + "loss": 0.4872, + "num_tokens": 480480087.0, + "step": 2515 + }, + { + "epoch": 0.8600682593856656, + "grad_norm": 0.3286565488200801, + "learning_rate": 7.3785104913789284e-06, + "loss": 0.493, + "num_tokens": 481428209.0, + "step": 2520 + }, + { + "epoch": 0.8617747440273038, + "grad_norm": 0.3202404807748421, + "learning_rate": 7.322001845156215e-06, + "loss": 0.4634, + "num_tokens": 482333846.0, + "step": 2525 + }, + { + "epoch": 0.863481228668942, + "grad_norm": 0.3361830515481688, + "learning_rate": 7.2661360202448344e-06, + "loss": 0.4904, + "num_tokens": 483299671.0, + "step": 2530 + }, + { + "epoch": 0.8651877133105802, + "grad_norm": 0.3177624111695824, + "learning_rate": 7.2109147963923335e-06, + "loss": 0.4988, + "num_tokens": 484313085.0, + "step": 2535 + }, + { + "epoch": 0.8668941979522184, + "grad_norm": 0.32624228168571706, + "learning_rate": 7.156339932810871e-06, + "loss": 0.4968, + "num_tokens": 485251856.0, + "step": 2540 + }, + { + "epoch": 0.8686006825938567, + "grad_norm": 0.2897757010993598, + "learning_rate": 7.1024131681211455e-06, + "loss": 0.4779, + "num_tokens": 486246381.0, + "step": 2545 + }, + { + "epoch": 0.8703071672354948, + "grad_norm": 0.3120988592440139, + "learning_rate": 7.0491362202970295e-06, + "loss": 0.4712, + "num_tokens": 487198446.0, + "step": 2550 + }, + { + "epoch": 0.8720136518771331, + "grad_norm": 0.2815904159190546, + "learning_rate": 6.9965107866108274e-06, + "loss": 0.4722, + "num_tokens": 488156403.0, + "step": 2555 + }, + { + "epoch": 0.8737201365187713, + "grad_norm": 0.3273103504291178, + "learning_rate": 6.9445385435792095e-06, + "loss": 0.4695, + "num_tokens": 489140124.0, + "step": 2560 + }, + { + "epoch": 0.8754266211604096, + "grad_norm": 0.3289263665204276, + "learning_rate": 6.893221146909806e-06, + "loss": 0.4724, + "num_tokens": 490104565.0, + "step": 2565 + }, + { + "epoch": 0.8771331058020477, + "grad_norm": 0.3146346775894274, + "learning_rate": 6.84256023144845e-06, + "loss": 0.4762, + "num_tokens": 491054868.0, + "step": 2570 + }, + { + "epoch": 0.878839590443686, + "grad_norm": 0.29961414202911324, + "learning_rate": 6.792557411127099e-06, + "loss": 0.4704, + "num_tokens": 492078546.0, + "step": 2575 + }, + { + "epoch": 0.8805460750853242, + "grad_norm": 0.34594580055469915, + "learning_rate": 6.74321427891242e-06, + "loss": 0.4851, + "num_tokens": 492974236.0, + "step": 2580 + }, + { + "epoch": 0.8822525597269625, + "grad_norm": 0.3066722562434767, + "learning_rate": 6.694532406755053e-06, + "loss": 0.478, + "num_tokens": 494019470.0, + "step": 2585 + }, + { + "epoch": 0.8839590443686007, + "grad_norm": 0.2923055504113335, + "learning_rate": 6.646513345539509e-06, + "loss": 0.516, + "num_tokens": 495062198.0, + "step": 2590 + }, + { + "epoch": 0.8856655290102389, + "grad_norm": 0.2953358109985866, + "learning_rate": 6.59915862503478e-06, + "loss": 0.4668, + "num_tokens": 496039347.0, + "step": 2595 + }, + { + "epoch": 0.8873720136518771, + "grad_norm": 0.33489720113078275, + "learning_rate": 6.552469753845601e-06, + "loss": 0.4715, + "num_tokens": 496987511.0, + "step": 2600 + }, + { + "epoch": 0.8890784982935154, + "grad_norm": 0.35147052248086963, + "learning_rate": 6.506448219364389e-06, + "loss": 0.4952, + "num_tokens": 497953501.0, + "step": 2605 + }, + { + "epoch": 0.8907849829351536, + "grad_norm": 0.2976052092258336, + "learning_rate": 6.461095487723852e-06, + "loss": 0.4703, + "num_tokens": 498971917.0, + "step": 2610 + }, + { + "epoch": 0.8924914675767918, + "grad_norm": 0.3166781793747034, + "learning_rate": 6.416413003750289e-06, + "loss": 0.4765, + "num_tokens": 499959465.0, + "step": 2615 + }, + { + "epoch": 0.89419795221843, + "grad_norm": 0.3094928641877226, + "learning_rate": 6.3724021909175636e-06, + "loss": 0.4714, + "num_tokens": 500947010.0, + "step": 2620 + }, + { + "epoch": 0.8959044368600683, + "grad_norm": 0.3403135875166433, + "learning_rate": 6.3290644513017496e-06, + "loss": 0.4838, + "num_tokens": 501972930.0, + "step": 2625 + }, + { + "epoch": 0.8976109215017065, + "grad_norm": 0.35528577423877566, + "learning_rate": 6.286401165536466e-06, + "loss": 0.4974, + "num_tokens": 502958987.0, + "step": 2630 + }, + { + "epoch": 0.8993174061433447, + "grad_norm": 0.3318258094487176, + "learning_rate": 6.244413692768893e-06, + "loss": 0.4767, + "num_tokens": 503946765.0, + "step": 2635 + }, + { + "epoch": 0.9010238907849829, + "grad_norm": 0.30577807197719564, + "learning_rate": 6.2031033706164715e-06, + "loss": 0.471, + "num_tokens": 504893463.0, + "step": 2640 + }, + { + "epoch": 0.9027303754266212, + "grad_norm": 0.3384112726283077, + "learning_rate": 6.162471515124292e-06, + "loss": 0.481, + "num_tokens": 505899175.0, + "step": 2645 + }, + { + "epoch": 0.9044368600682594, + "grad_norm": 0.30699959739486854, + "learning_rate": 6.122519420723182e-06, + "loss": 0.4733, + "num_tokens": 506861395.0, + "step": 2650 + }, + { + "epoch": 0.9061433447098977, + "grad_norm": 0.3222784803973646, + "learning_rate": 6.083248360188437e-06, + "loss": 0.4825, + "num_tokens": 507817589.0, + "step": 2655 + }, + { + "epoch": 0.9078498293515358, + "grad_norm": 0.3403142746810214, + "learning_rate": 6.044659584599297e-06, + "loss": 0.4761, + "num_tokens": 508690960.0, + "step": 2660 + }, + { + "epoch": 0.909556313993174, + "grad_norm": 0.2953479643563658, + "learning_rate": 6.006754323299088e-06, + "loss": 0.4804, + "num_tokens": 509649233.0, + "step": 2665 + }, + { + "epoch": 0.9112627986348123, + "grad_norm": 0.30487020552378524, + "learning_rate": 5.969533783856054e-06, + "loss": 0.4777, + "num_tokens": 510627944.0, + "step": 2670 + }, + { + "epoch": 0.9129692832764505, + "grad_norm": 0.30704549371766876, + "learning_rate": 5.932999152024885e-06, + "loss": 0.4822, + "num_tokens": 511591407.0, + "step": 2675 + }, + { + "epoch": 0.9146757679180887, + "grad_norm": 0.33249148261118033, + "learning_rate": 5.897151591708947e-06, + "loss": 0.5016, + "num_tokens": 512558436.0, + "step": 2680 + }, + { + "epoch": 0.9163822525597269, + "grad_norm": 0.3229296450202934, + "learning_rate": 5.861992244923199e-06, + "loss": 0.4735, + "num_tokens": 513474763.0, + "step": 2685 + }, + { + "epoch": 0.9180887372013652, + "grad_norm": 0.31994250178502454, + "learning_rate": 5.827522231757808e-06, + "loss": 0.4609, + "num_tokens": 514407245.0, + "step": 2690 + }, + { + "epoch": 0.9197952218430034, + "grad_norm": 0.30727719477408283, + "learning_rate": 5.793742650342482e-06, + "loss": 0.4611, + "num_tokens": 515337057.0, + "step": 2695 + }, + { + "epoch": 0.9215017064846417, + "grad_norm": 0.3422479555475027, + "learning_rate": 5.760654576811455e-06, + "loss": 0.5085, + "num_tokens": 516301089.0, + "step": 2700 + }, + { + "epoch": 0.9232081911262798, + "grad_norm": 0.32317795820730566, + "learning_rate": 5.728259065269248e-06, + "loss": 0.4808, + "num_tokens": 517258131.0, + "step": 2705 + }, + { + "epoch": 0.9249146757679181, + "grad_norm": 0.31035185685822675, + "learning_rate": 5.696557147757041e-06, + "loss": 0.4989, + "num_tokens": 518223298.0, + "step": 2710 + }, + { + "epoch": 0.9266211604095563, + "grad_norm": 0.32127700006025317, + "learning_rate": 5.66554983421983e-06, + "loss": 0.4721, + "num_tokens": 519130985.0, + "step": 2715 + }, + { + "epoch": 0.9283276450511946, + "grad_norm": 0.3248191734813711, + "learning_rate": 5.635238112474237e-06, + "loss": 0.4878, + "num_tokens": 520051962.0, + "step": 2720 + }, + { + "epoch": 0.9300341296928327, + "grad_norm": 0.32016262129429485, + "learning_rate": 5.605622948177032e-06, + "loss": 0.4612, + "num_tokens": 520934447.0, + "step": 2725 + }, + { + "epoch": 0.931740614334471, + "grad_norm": 0.3075215750119489, + "learning_rate": 5.576705284794404e-06, + "loss": 0.4717, + "num_tokens": 521910187.0, + "step": 2730 + }, + { + "epoch": 0.9334470989761092, + "grad_norm": 0.2966943509184171, + "learning_rate": 5.548486043571861e-06, + "loss": 0.4615, + "num_tokens": 522876883.0, + "step": 2735 + }, + { + "epoch": 0.9351535836177475, + "grad_norm": 0.3823699167375232, + "learning_rate": 5.52096612350491e-06, + "loss": 0.4899, + "num_tokens": 523880084.0, + "step": 2740 + }, + { + "epoch": 0.9368600682593856, + "grad_norm": 0.3445674466297448, + "learning_rate": 5.494146401310404e-06, + "loss": 0.4792, + "num_tokens": 524788350.0, + "step": 2745 + }, + { + "epoch": 0.9385665529010239, + "grad_norm": 0.29579943008870063, + "learning_rate": 5.468027731398621e-06, + "loss": 0.4863, + "num_tokens": 525832920.0, + "step": 2750 + }, + { + "epoch": 0.9402730375426621, + "grad_norm": 0.30558147446901934, + "learning_rate": 5.442610945846045e-06, + "loss": 0.4943, + "num_tokens": 526845340.0, + "step": 2755 + }, + { + "epoch": 0.9419795221843004, + "grad_norm": 0.32100249042860457, + "learning_rate": 5.41789685436884e-06, + "loss": 0.4788, + "num_tokens": 527753736.0, + "step": 2760 + }, + { + "epoch": 0.9436860068259386, + "grad_norm": 0.32004490139915764, + "learning_rate": 5.393886244297079e-06, + "loss": 0.4817, + "num_tokens": 528798665.0, + "step": 2765 + }, + { + "epoch": 0.9453924914675768, + "grad_norm": 0.3325347960685324, + "learning_rate": 5.370579880549647e-06, + "loss": 0.4878, + "num_tokens": 529711197.0, + "step": 2770 + }, + { + "epoch": 0.947098976109215, + "grad_norm": 0.3303511181497015, + "learning_rate": 5.347978505609877e-06, + "loss": 0.4693, + "num_tokens": 530632318.0, + "step": 2775 + }, + { + "epoch": 0.9488054607508533, + "grad_norm": 0.3044920349269044, + "learning_rate": 5.326082839501891e-06, + "loss": 0.4881, + "num_tokens": 531553862.0, + "step": 2780 + }, + { + "epoch": 0.9505119453924915, + "grad_norm": 0.30326530639070487, + "learning_rate": 5.304893579767674e-06, + "loss": 0.4935, + "num_tokens": 532532990.0, + "step": 2785 + }, + { + "epoch": 0.9522184300341296, + "grad_norm": 0.31251416953382594, + "learning_rate": 5.284411401444836e-06, + "loss": 0.4933, + "num_tokens": 533448215.0, + "step": 2790 + }, + { + "epoch": 0.9539249146757679, + "grad_norm": 0.32753454088259387, + "learning_rate": 5.264636957045122e-06, + "loss": 0.4824, + "num_tokens": 534401740.0, + "step": 2795 + }, + { + "epoch": 0.9556313993174061, + "grad_norm": 0.30791599389681806, + "learning_rate": 5.245570876533615e-06, + "loss": 0.4685, + "num_tokens": 535341346.0, + "step": 2800 + }, + { + "epoch": 0.9573378839590444, + "grad_norm": 0.313479264368028, + "learning_rate": 5.227213767308668e-06, + "loss": 0.4575, + "num_tokens": 536296941.0, + "step": 2805 + }, + { + "epoch": 0.9590443686006825, + "grad_norm": 0.29032270836531787, + "learning_rate": 5.209566214182558e-06, + "loss": 0.4742, + "num_tokens": 537336227.0, + "step": 2810 + }, + { + "epoch": 0.9607508532423208, + "grad_norm": 0.30878749649756715, + "learning_rate": 5.1926287793628515e-06, + "loss": 0.4843, + "num_tokens": 538331668.0, + "step": 2815 + }, + { + "epoch": 0.962457337883959, + "grad_norm": 0.3144044223958235, + "learning_rate": 5.176402002434495e-06, + "loss": 0.4596, + "num_tokens": 539226192.0, + "step": 2820 + }, + { + "epoch": 0.9641638225255973, + "grad_norm": 0.33929109387452877, + "learning_rate": 5.1608864003426255e-06, + "loss": 0.4783, + "num_tokens": 540188216.0, + "step": 2825 + }, + { + "epoch": 0.9658703071672355, + "grad_norm": 0.3147575010798021, + "learning_rate": 5.146082467376103e-06, + "loss": 0.4742, + "num_tokens": 541043880.0, + "step": 2830 + }, + { + "epoch": 0.9675767918088737, + "grad_norm": 0.29692914861077196, + "learning_rate": 5.131990675151757e-06, + "loss": 0.4915, + "num_tokens": 541979505.0, + "step": 2835 + }, + { + "epoch": 0.9692832764505119, + "grad_norm": 0.3233760089537912, + "learning_rate": 5.1186114725993754e-06, + "loss": 0.4741, + "num_tokens": 542913487.0, + "step": 2840 + }, + { + "epoch": 0.9709897610921502, + "grad_norm": 0.3243453736091082, + "learning_rate": 5.105945285947394e-06, + "loss": 0.4856, + "num_tokens": 543888926.0, + "step": 2845 + }, + { + "epoch": 0.9726962457337884, + "grad_norm": 0.34332556463465, + "learning_rate": 5.09399251870931e-06, + "loss": 0.5042, + "num_tokens": 544851416.0, + "step": 2850 + }, + { + "epoch": 0.9744027303754266, + "grad_norm": 0.35805837088581816, + "learning_rate": 5.082753551670843e-06, + "loss": 0.4832, + "num_tokens": 545757943.0, + "step": 2855 + }, + { + "epoch": 0.9761092150170648, + "grad_norm": 0.3330936930767059, + "learning_rate": 5.072228742877796e-06, + "loss": 0.4861, + "num_tokens": 546668069.0, + "step": 2860 + }, + { + "epoch": 0.9778156996587031, + "grad_norm": 0.2996095611574381, + "learning_rate": 5.062418427624646e-06, + "loss": 0.4706, + "num_tokens": 547620964.0, + "step": 2865 + }, + { + "epoch": 0.9795221843003413, + "grad_norm": 0.3334623473830669, + "learning_rate": 5.053322918443873e-06, + "loss": 0.4815, + "num_tokens": 548580998.0, + "step": 2870 + }, + { + "epoch": 0.9812286689419796, + "grad_norm": 0.3134805266587632, + "learning_rate": 5.0449425050959876e-06, + "loss": 0.49, + "num_tokens": 549511852.0, + "step": 2875 + }, + { + "epoch": 0.9829351535836177, + "grad_norm": 0.3309875762519481, + "learning_rate": 5.0372774545603155e-06, + "loss": 0.4617, + "num_tokens": 550468465.0, + "step": 2880 + }, + { + "epoch": 0.984641638225256, + "grad_norm": 0.3215047597552604, + "learning_rate": 5.0303280110264825e-06, + "loss": 0.4681, + "num_tokens": 551376796.0, + "step": 2885 + }, + { + "epoch": 0.9863481228668942, + "grad_norm": 0.45537977132918783, + "learning_rate": 5.02409439588664e-06, + "loss": 0.4914, + "num_tokens": 552397091.0, + "step": 2890 + }, + { + "epoch": 0.9880546075085325, + "grad_norm": 0.30892367299813056, + "learning_rate": 5.018576807728409e-06, + "loss": 0.4632, + "num_tokens": 553378344.0, + "step": 2895 + }, + { + "epoch": 0.9897610921501706, + "grad_norm": 0.32443914024425874, + "learning_rate": 5.013775422328553e-06, + "loss": 0.466, + "num_tokens": 554315309.0, + "step": 2900 + }, + { + "epoch": 0.9914675767918089, + "grad_norm": 0.3310428027199226, + "learning_rate": 5.0096903926473885e-06, + "loss": 0.4724, + "num_tokens": 555217376.0, + "step": 2905 + }, + { + "epoch": 0.9931740614334471, + "grad_norm": 0.3211087070464618, + "learning_rate": 5.00632184882389e-06, + "loss": 0.4991, + "num_tokens": 556147373.0, + "step": 2910 + }, + { + "epoch": 0.9948805460750854, + "grad_norm": 0.3344876290335765, + "learning_rate": 5.00366989817157e-06, + "loss": 0.4827, + "num_tokens": 557076223.0, + "step": 2915 + }, + { + "epoch": 0.9965870307167235, + "grad_norm": 0.32216989394760803, + "learning_rate": 5.0017346251750415e-06, + "loss": 0.4646, + "num_tokens": 558093284.0, + "step": 2920 + }, + { + "epoch": 0.9982935153583617, + "grad_norm": 0.3263881006745613, + "learning_rate": 5.000516091487337e-06, + "loss": 0.4751, + "num_tokens": 558997916.0, + "step": 2925 + }, + { + "epoch": 1.0, + "grad_norm": 0.3026241946213063, + "learning_rate": 5.00001433592793e-06, + "loss": 0.4825, + "num_tokens": 560009809.0, + "step": 2930 + }, + { + "epoch": 1.0, + "step": 2930, + "total_flos": 1146719603261440.0, + "train_loss": 0.5200651722149637, + "train_runtime": 20826.6168, + "train_samples_per_second": 4.501, + "train_steps_per_second": 0.141 + } + ], + "logging_steps": 5, + "max_steps": 2930, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1146719603261440.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}