OpenR1-Qwen-7B-French / trainer_state.json
bezir's picture
Model save
502ce47 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9994982438534872,
"eval_steps": 100,
"global_step": 2490,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004014049172102358,
"grad_norm": 6.058766803882945,
"learning_rate": 2.0080321285140563e-07,
"loss": 1.1175,
"mean_token_accuracy": 0.7435387402772904,
"step": 5
},
{
"epoch": 0.008028098344204716,
"grad_norm": 5.776390774267812,
"learning_rate": 4.0160642570281125e-07,
"loss": 1.0984,
"mean_token_accuracy": 0.7462003320455551,
"step": 10
},
{
"epoch": 0.012042147516307075,
"grad_norm": 4.964372817070418,
"learning_rate": 6.024096385542169e-07,
"loss": 1.1026,
"mean_token_accuracy": 0.7443804442882538,
"step": 15
},
{
"epoch": 0.016056196688409432,
"grad_norm": 3.8205027028573406,
"learning_rate": 8.032128514056225e-07,
"loss": 1.0507,
"mean_token_accuracy": 0.7505107507109642,
"step": 20
},
{
"epoch": 0.02007024586051179,
"grad_norm": 2.2377857535427106,
"learning_rate": 1.0040160642570282e-06,
"loss": 0.9846,
"mean_token_accuracy": 0.7566263154149055,
"step": 25
},
{
"epoch": 0.02408429503261415,
"grad_norm": 2.019284032849979,
"learning_rate": 1.2048192771084338e-06,
"loss": 0.9575,
"mean_token_accuracy": 0.7592063814401626,
"step": 30
},
{
"epoch": 0.02809834420471651,
"grad_norm": 1.4724634441521711,
"learning_rate": 1.4056224899598394e-06,
"loss": 0.8998,
"mean_token_accuracy": 0.7671008065342904,
"step": 35
},
{
"epoch": 0.032112393376818864,
"grad_norm": 1.1460238128354983,
"learning_rate": 1.606425702811245e-06,
"loss": 0.8622,
"mean_token_accuracy": 0.774419629573822,
"step": 40
},
{
"epoch": 0.03612644254892122,
"grad_norm": 0.8962017383164974,
"learning_rate": 1.8072289156626508e-06,
"loss": 0.8274,
"mean_token_accuracy": 0.7802118778228759,
"step": 45
},
{
"epoch": 0.04014049172102358,
"grad_norm": 0.812855071257074,
"learning_rate": 2.0080321285140564e-06,
"loss": 0.7938,
"mean_token_accuracy": 0.7862023413181305,
"step": 50
},
{
"epoch": 0.04415454089312594,
"grad_norm": 0.70094482944081,
"learning_rate": 2.2088353413654622e-06,
"loss": 0.7875,
"mean_token_accuracy": 0.7867150768637657,
"step": 55
},
{
"epoch": 0.0481685900652283,
"grad_norm": 0.6178275403403997,
"learning_rate": 2.4096385542168676e-06,
"loss": 0.7622,
"mean_token_accuracy": 0.7920725375413895,
"step": 60
},
{
"epoch": 0.05218263923733066,
"grad_norm": 0.6043428588395752,
"learning_rate": 2.6104417670682734e-06,
"loss": 0.7431,
"mean_token_accuracy": 0.7964029759168625,
"step": 65
},
{
"epoch": 0.05619668840943302,
"grad_norm": 0.575539588575506,
"learning_rate": 2.811244979919679e-06,
"loss": 0.7275,
"mean_token_accuracy": 0.79924486130476,
"step": 70
},
{
"epoch": 0.060210737581535376,
"grad_norm": 0.580001753536299,
"learning_rate": 3.012048192771085e-06,
"loss": 0.7098,
"mean_token_accuracy": 0.8032902508974076,
"step": 75
},
{
"epoch": 0.06422478675363773,
"grad_norm": 0.5522658470324157,
"learning_rate": 3.21285140562249e-06,
"loss": 0.6998,
"mean_token_accuracy": 0.8053625896573067,
"step": 80
},
{
"epoch": 0.0682388359257401,
"grad_norm": 0.5299497087282212,
"learning_rate": 3.4136546184738962e-06,
"loss": 0.7,
"mean_token_accuracy": 0.8049216270446777,
"step": 85
},
{
"epoch": 0.07225288509784245,
"grad_norm": 0.5510228635994878,
"learning_rate": 3.6144578313253016e-06,
"loss": 0.7066,
"mean_token_accuracy": 0.803421251475811,
"step": 90
},
{
"epoch": 0.07626693426994481,
"grad_norm": 0.5707453842809624,
"learning_rate": 3.8152610441767074e-06,
"loss": 0.685,
"mean_token_accuracy": 0.8084254205226898,
"step": 95
},
{
"epoch": 0.08028098344204716,
"grad_norm": 0.6177649426118329,
"learning_rate": 4.016064257028113e-06,
"loss": 0.6964,
"mean_token_accuracy": 0.8053169295191764,
"step": 100
},
{
"epoch": 0.08429503261414953,
"grad_norm": 0.5690454870209587,
"learning_rate": 4.216867469879519e-06,
"loss": 0.687,
"mean_token_accuracy": 0.8071819305419922,
"step": 105
},
{
"epoch": 0.08830908178625188,
"grad_norm": 0.6164411988135075,
"learning_rate": 4.4176706827309244e-06,
"loss": 0.6607,
"mean_token_accuracy": 0.8136294975876808,
"step": 110
},
{
"epoch": 0.09232313095835425,
"grad_norm": 0.553614457840363,
"learning_rate": 4.61847389558233e-06,
"loss": 0.6623,
"mean_token_accuracy": 0.8132717654109001,
"step": 115
},
{
"epoch": 0.0963371801304566,
"grad_norm": 0.5422023366195315,
"learning_rate": 4.819277108433735e-06,
"loss": 0.6721,
"mean_token_accuracy": 0.8104784831404686,
"step": 120
},
{
"epoch": 0.10035122930255895,
"grad_norm": 0.5133572915810863,
"learning_rate": 5.0200803212851415e-06,
"loss": 0.6645,
"mean_token_accuracy": 0.8119137555360794,
"step": 125
},
{
"epoch": 0.10436527847466132,
"grad_norm": 0.58503647555332,
"learning_rate": 5.220883534136547e-06,
"loss": 0.6635,
"mean_token_accuracy": 0.8120792865753174,
"step": 130
},
{
"epoch": 0.10837932764676367,
"grad_norm": 0.5684351953572544,
"learning_rate": 5.421686746987952e-06,
"loss": 0.6505,
"mean_token_accuracy": 0.8155466809868812,
"step": 135
},
{
"epoch": 0.11239337681886603,
"grad_norm": 0.5447677613420705,
"learning_rate": 5.622489959839358e-06,
"loss": 0.6449,
"mean_token_accuracy": 0.8165045753121376,
"step": 140
},
{
"epoch": 0.11640742599096839,
"grad_norm": 0.6432560163866337,
"learning_rate": 5.823293172690764e-06,
"loss": 0.647,
"mean_token_accuracy": 0.8159680441021919,
"step": 145
},
{
"epoch": 0.12042147516307075,
"grad_norm": 0.5878776869535187,
"learning_rate": 6.02409638554217e-06,
"loss": 0.6398,
"mean_token_accuracy": 0.8172471389174462,
"step": 150
},
{
"epoch": 0.1244355243351731,
"grad_norm": 0.5528353725419236,
"learning_rate": 6.224899598393575e-06,
"loss": 0.6453,
"mean_token_accuracy": 0.8162423759698868,
"step": 155
},
{
"epoch": 0.12844957350727546,
"grad_norm": 0.7274143611573796,
"learning_rate": 6.42570281124498e-06,
"loss": 0.6538,
"mean_token_accuracy": 0.8140816584229469,
"step": 160
},
{
"epoch": 0.13246362267937783,
"grad_norm": 0.602479326169129,
"learning_rate": 6.626506024096386e-06,
"loss": 0.6388,
"mean_token_accuracy": 0.8172610536217689,
"step": 165
},
{
"epoch": 0.1364776718514802,
"grad_norm": 0.59001493891811,
"learning_rate": 6.8273092369477925e-06,
"loss": 0.6386,
"mean_token_accuracy": 0.8174594342708588,
"step": 170
},
{
"epoch": 0.14049172102358254,
"grad_norm": 0.643343148654434,
"learning_rate": 7.028112449799197e-06,
"loss": 0.6449,
"mean_token_accuracy": 0.8159894704818725,
"step": 175
},
{
"epoch": 0.1445057701956849,
"grad_norm": 0.6086401788605312,
"learning_rate": 7.228915662650603e-06,
"loss": 0.619,
"mean_token_accuracy": 0.821616081893444,
"step": 180
},
{
"epoch": 0.14851981936778724,
"grad_norm": 0.6081562590281072,
"learning_rate": 7.429718875502009e-06,
"loss": 0.6338,
"mean_token_accuracy": 0.8180027529597282,
"step": 185
},
{
"epoch": 0.15253386853988962,
"grad_norm": 0.5631488889183262,
"learning_rate": 7.630522088353415e-06,
"loss": 0.6206,
"mean_token_accuracy": 0.8213231518864632,
"step": 190
},
{
"epoch": 0.15654791771199197,
"grad_norm": 0.6623884065817287,
"learning_rate": 7.83132530120482e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.8204338252544403,
"step": 195
},
{
"epoch": 0.16056196688409433,
"grad_norm": 0.6674297683399895,
"learning_rate": 8.032128514056226e-06,
"loss": 0.6312,
"mean_token_accuracy": 0.8186420142650604,
"step": 200
},
{
"epoch": 0.16457601605619668,
"grad_norm": 0.704186654303425,
"learning_rate": 8.232931726907631e-06,
"loss": 0.6266,
"mean_token_accuracy": 0.8195266082882882,
"step": 205
},
{
"epoch": 0.16859006522829906,
"grad_norm": 0.6473103907382632,
"learning_rate": 8.433734939759038e-06,
"loss": 0.6287,
"mean_token_accuracy": 0.819443441927433,
"step": 210
},
{
"epoch": 0.1726041144004014,
"grad_norm": 0.5992551573092145,
"learning_rate": 8.634538152610442e-06,
"loss": 0.6141,
"mean_token_accuracy": 0.8228801786899567,
"step": 215
},
{
"epoch": 0.17661816357250376,
"grad_norm": 0.607794767167812,
"learning_rate": 8.835341365461849e-06,
"loss": 0.6233,
"mean_token_accuracy": 0.8208266854286194,
"step": 220
},
{
"epoch": 0.1806322127446061,
"grad_norm": 0.6251260607888597,
"learning_rate": 9.036144578313254e-06,
"loss": 0.6186,
"mean_token_accuracy": 0.8219277203083039,
"step": 225
},
{
"epoch": 0.1846462619167085,
"grad_norm": 0.7752879519685737,
"learning_rate": 9.23694779116466e-06,
"loss": 0.616,
"mean_token_accuracy": 0.8226246342062951,
"step": 230
},
{
"epoch": 0.18866031108881084,
"grad_norm": 0.6885986830946352,
"learning_rate": 9.437751004016065e-06,
"loss": 0.6162,
"mean_token_accuracy": 0.82130526304245,
"step": 235
},
{
"epoch": 0.1926743602609132,
"grad_norm": 0.6299815927660661,
"learning_rate": 9.63855421686747e-06,
"loss": 0.6173,
"mean_token_accuracy": 0.8219423845410347,
"step": 240
},
{
"epoch": 0.19668840943301555,
"grad_norm": 0.7583480330442449,
"learning_rate": 9.839357429718876e-06,
"loss": 0.6148,
"mean_token_accuracy": 0.8225555747747422,
"step": 245
},
{
"epoch": 0.2007024586051179,
"grad_norm": 0.7143206609736337,
"learning_rate": 9.99999508689586e-06,
"loss": 0.6154,
"mean_token_accuracy": 0.8219420880079269,
"step": 250
},
{
"epoch": 0.20471650777722028,
"grad_norm": 0.6907038048664699,
"learning_rate": 9.999823129264712e-06,
"loss": 0.6108,
"mean_token_accuracy": 0.8229139536619187,
"step": 255
},
{
"epoch": 0.20873055694932263,
"grad_norm": 0.6559142117068314,
"learning_rate": 9.999405526081825e-06,
"loss": 0.6189,
"mean_token_accuracy": 0.8209213152527809,
"step": 260
},
{
"epoch": 0.21274460612142498,
"grad_norm": 0.6659512333420378,
"learning_rate": 9.998742297864394e-06,
"loss": 0.6035,
"mean_token_accuracy": 0.8244217514991761,
"step": 265
},
{
"epoch": 0.21675865529352734,
"grad_norm": 0.5947661763992849,
"learning_rate": 9.997833477197386e-06,
"loss": 0.5997,
"mean_token_accuracy": 0.8259585857391357,
"step": 270
},
{
"epoch": 0.22077270446562972,
"grad_norm": 0.8438008201731584,
"learning_rate": 9.99667910873193e-06,
"loss": 0.6078,
"mean_token_accuracy": 0.8240197777748108,
"step": 275
},
{
"epoch": 0.22478675363773207,
"grad_norm": 0.8713006693349223,
"learning_rate": 9.99527924918313e-06,
"loss": 0.6016,
"mean_token_accuracy": 0.8250452890992165,
"step": 280
},
{
"epoch": 0.22880080280983442,
"grad_norm": 0.7120303554563648,
"learning_rate": 9.99363396732727e-06,
"loss": 0.6052,
"mean_token_accuracy": 0.8246865943074226,
"step": 285
},
{
"epoch": 0.23281485198193677,
"grad_norm": 0.6101967868649579,
"learning_rate": 9.991743343998446e-06,
"loss": 0.5928,
"mean_token_accuracy": 0.8269972503185272,
"step": 290
},
{
"epoch": 0.23682890115403912,
"grad_norm": 0.6881046614257558,
"learning_rate": 9.989607472084583e-06,
"loss": 0.5997,
"mean_token_accuracy": 0.8250862330198288,
"step": 295
},
{
"epoch": 0.2408429503261415,
"grad_norm": 0.6112394055545181,
"learning_rate": 9.987226456522884e-06,
"loss": 0.5915,
"mean_token_accuracy": 0.8273163467645646,
"step": 300
},
{
"epoch": 0.24485699949824385,
"grad_norm": 0.6277850468291706,
"learning_rate": 9.98460041429466e-06,
"loss": 0.5903,
"mean_token_accuracy": 0.8281888499855995,
"step": 305
},
{
"epoch": 0.2488710486703462,
"grad_norm": 0.7241042442848334,
"learning_rate": 9.981729474419595e-06,
"loss": 0.6041,
"mean_token_accuracy": 0.8239907890558242,
"step": 310
},
{
"epoch": 0.25288509784244856,
"grad_norm": 0.6241491569251894,
"learning_rate": 9.978613777949401e-06,
"loss": 0.5964,
"mean_token_accuracy": 0.8260207697749138,
"step": 315
},
{
"epoch": 0.2568991470145509,
"grad_norm": 0.6370033368365354,
"learning_rate": 9.975253477960887e-06,
"loss": 0.5917,
"mean_token_accuracy": 0.8270863309502602,
"step": 320
},
{
"epoch": 0.26091319618665326,
"grad_norm": 0.638196228956886,
"learning_rate": 9.971648739548443e-06,
"loss": 0.5955,
"mean_token_accuracy": 0.8271015107631683,
"step": 325
},
{
"epoch": 0.26492724535875567,
"grad_norm": 0.5924987938379438,
"learning_rate": 9.967799739815925e-06,
"loss": 0.5953,
"mean_token_accuracy": 0.8263336911797523,
"step": 330
},
{
"epoch": 0.268941294530858,
"grad_norm": 0.6085311126983912,
"learning_rate": 9.963706667867956e-06,
"loss": 0.5963,
"mean_token_accuracy": 0.8259957909584046,
"step": 335
},
{
"epoch": 0.2729553437029604,
"grad_norm": 0.739333912621636,
"learning_rate": 9.95936972480063e-06,
"loss": 0.5904,
"mean_token_accuracy": 0.8273468598723411,
"step": 340
},
{
"epoch": 0.2769693928750627,
"grad_norm": 0.7146326321442452,
"learning_rate": 9.954789123691643e-06,
"loss": 0.5826,
"mean_token_accuracy": 0.829837815463543,
"step": 345
},
{
"epoch": 0.2809834420471651,
"grad_norm": 0.6734576231685178,
"learning_rate": 9.94996508958981e-06,
"loss": 0.5887,
"mean_token_accuracy": 0.828106701374054,
"step": 350
},
{
"epoch": 0.28499749121926743,
"grad_norm": 0.6525370797613959,
"learning_rate": 9.944897859504022e-06,
"loss": 0.5749,
"mean_token_accuracy": 0.8314093336462974,
"step": 355
},
{
"epoch": 0.2890115403913698,
"grad_norm": 0.656220700779885,
"learning_rate": 9.939587682391587e-06,
"loss": 0.5881,
"mean_token_accuracy": 0.8278362900018692,
"step": 360
},
{
"epoch": 0.29302558956347213,
"grad_norm": 0.8291093196925574,
"learning_rate": 9.934034819146015e-06,
"loss": 0.5857,
"mean_token_accuracy": 0.8292678326368332,
"step": 365
},
{
"epoch": 0.2970396387355745,
"grad_norm": 0.75313947093166,
"learning_rate": 9.928239542584186e-06,
"loss": 0.581,
"mean_token_accuracy": 0.8295959487557412,
"step": 370
},
{
"epoch": 0.3010536879076769,
"grad_norm": 0.6701377011807581,
"learning_rate": 9.922202137432954e-06,
"loss": 0.5774,
"mean_token_accuracy": 0.8301370680332184,
"step": 375
},
{
"epoch": 0.30506773707977924,
"grad_norm": 0.6752117129825771,
"learning_rate": 9.915922900315158e-06,
"loss": 0.5862,
"mean_token_accuracy": 0.828272470831871,
"step": 380
},
{
"epoch": 0.3090817862518816,
"grad_norm": 0.6462301731086534,
"learning_rate": 9.90940213973504e-06,
"loss": 0.5872,
"mean_token_accuracy": 0.8279532685875892,
"step": 385
},
{
"epoch": 0.31309583542398395,
"grad_norm": 0.6180558660475647,
"learning_rate": 9.902640176063103e-06,
"loss": 0.5839,
"mean_token_accuracy": 0.8296105518937111,
"step": 390
},
{
"epoch": 0.3171098845960863,
"grad_norm": 0.7334201323221599,
"learning_rate": 9.895637341520357e-06,
"loss": 0.5914,
"mean_token_accuracy": 0.8273193582892417,
"step": 395
},
{
"epoch": 0.32112393376818865,
"grad_norm": 0.6244699500533561,
"learning_rate": 9.888393980162e-06,
"loss": 0.5894,
"mean_token_accuracy": 0.8276168584823609,
"step": 400
},
{
"epoch": 0.325137982940291,
"grad_norm": 0.5890996107094697,
"learning_rate": 9.880910447860527e-06,
"loss": 0.5714,
"mean_token_accuracy": 0.8319997638463974,
"step": 405
},
{
"epoch": 0.32915203211239336,
"grad_norm": 0.6067071145439632,
"learning_rate": 9.873187112288224e-06,
"loss": 0.5776,
"mean_token_accuracy": 0.8308833613991737,
"step": 410
},
{
"epoch": 0.3331660812844957,
"grad_norm": 0.599604126460933,
"learning_rate": 9.86522435289912e-06,
"loss": 0.591,
"mean_token_accuracy": 0.8270509079098701,
"step": 415
},
{
"epoch": 0.3371801304565981,
"grad_norm": 0.6817769908050123,
"learning_rate": 9.857022560910338e-06,
"loss": 0.5834,
"mean_token_accuracy": 0.8284465298056602,
"step": 420
},
{
"epoch": 0.34119417962870047,
"grad_norm": 0.718771722804361,
"learning_rate": 9.848582139282879e-06,
"loss": 0.5806,
"mean_token_accuracy": 0.8296876326203346,
"step": 425
},
{
"epoch": 0.3452082288008028,
"grad_norm": 0.751747783066705,
"learning_rate": 9.839903502701815e-06,
"loss": 0.5866,
"mean_token_accuracy": 0.8281015455722809,
"step": 430
},
{
"epoch": 0.34922227797290517,
"grad_norm": 0.6843260226103427,
"learning_rate": 9.830987077555925e-06,
"loss": 0.5708,
"mean_token_accuracy": 0.8315734773874283,
"step": 435
},
{
"epoch": 0.3532363271450075,
"grad_norm": 0.609990923675998,
"learning_rate": 9.821833301916737e-06,
"loss": 0.5744,
"mean_token_accuracy": 0.8311674281954765,
"step": 440
},
{
"epoch": 0.3572503763171099,
"grad_norm": 0.6541236549277216,
"learning_rate": 9.812442625517017e-06,
"loss": 0.5759,
"mean_token_accuracy": 0.8305603489279747,
"step": 445
},
{
"epoch": 0.3612644254892122,
"grad_norm": 0.6333659668052299,
"learning_rate": 9.802815509728662e-06,
"loss": 0.5663,
"mean_token_accuracy": 0.8331687957048416,
"step": 450
},
{
"epoch": 0.3652784746613146,
"grad_norm": 0.6543354382780389,
"learning_rate": 9.792952427540037e-06,
"loss": 0.57,
"mean_token_accuracy": 0.8322424262762069,
"step": 455
},
{
"epoch": 0.369292523833417,
"grad_norm": 0.5994204037638772,
"learning_rate": 9.782853863532736e-06,
"loss": 0.5806,
"mean_token_accuracy": 0.8293602868914605,
"step": 460
},
{
"epoch": 0.37330657300551934,
"grad_norm": 0.6554085321235406,
"learning_rate": 9.772520313857777e-06,
"loss": 0.5664,
"mean_token_accuracy": 0.8329186499118805,
"step": 465
},
{
"epoch": 0.3773206221776217,
"grad_norm": 0.6642085090735961,
"learning_rate": 9.761952286211221e-06,
"loss": 0.5726,
"mean_token_accuracy": 0.8313711732625961,
"step": 470
},
{
"epoch": 0.38133467134972404,
"grad_norm": 0.6831480368586637,
"learning_rate": 9.75115029980923e-06,
"loss": 0.5719,
"mean_token_accuracy": 0.8317721590399743,
"step": 475
},
{
"epoch": 0.3853487205218264,
"grad_norm": 0.5545902036926461,
"learning_rate": 9.740114885362562e-06,
"loss": 0.5739,
"mean_token_accuracy": 0.8307827830314636,
"step": 480
},
{
"epoch": 0.38936276969392875,
"grad_norm": 0.6045330433361745,
"learning_rate": 9.728846585050486e-06,
"loss": 0.5609,
"mean_token_accuracy": 0.8340664029121398,
"step": 485
},
{
"epoch": 0.3933768188660311,
"grad_norm": 0.5622152800616955,
"learning_rate": 9.717345952494162e-06,
"loss": 0.5638,
"mean_token_accuracy": 0.8336855262517929,
"step": 490
},
{
"epoch": 0.39739086803813345,
"grad_norm": 0.6341361309278204,
"learning_rate": 9.705613552729416e-06,
"loss": 0.5778,
"mean_token_accuracy": 0.8299095824360847,
"step": 495
},
{
"epoch": 0.4014049172102358,
"grad_norm": 0.6805767537720608,
"learning_rate": 9.693649962179006e-06,
"loss": 0.5659,
"mean_token_accuracy": 0.8327050372958184,
"step": 500
},
{
"epoch": 0.4054189663823382,
"grad_norm": 0.5836257323922431,
"learning_rate": 9.681455768624284e-06,
"loss": 0.5778,
"mean_token_accuracy": 0.8303996577858925,
"step": 505
},
{
"epoch": 0.40943301555444056,
"grad_norm": 0.6427318323333274,
"learning_rate": 9.669031571176322e-06,
"loss": 0.5615,
"mean_token_accuracy": 0.8343263894319535,
"step": 510
},
{
"epoch": 0.4134470647265429,
"grad_norm": 0.742563712570686,
"learning_rate": 9.656377980246483e-06,
"loss": 0.562,
"mean_token_accuracy": 0.8343602031469345,
"step": 515
},
{
"epoch": 0.41746111389864526,
"grad_norm": 0.590467419516716,
"learning_rate": 9.64349561751642e-06,
"loss": 0.565,
"mean_token_accuracy": 0.8327525511384011,
"step": 520
},
{
"epoch": 0.4214751630707476,
"grad_norm": 0.6312451978096704,
"learning_rate": 9.630385115907545e-06,
"loss": 0.5705,
"mean_token_accuracy": 0.8321760416030883,
"step": 525
},
{
"epoch": 0.42548921224284997,
"grad_norm": 0.598557343394431,
"learning_rate": 9.617047119549925e-06,
"loss": 0.5737,
"mean_token_accuracy": 0.831158398091793,
"step": 530
},
{
"epoch": 0.4295032614149523,
"grad_norm": 0.5787052037585553,
"learning_rate": 9.603482283750631e-06,
"loss": 0.5722,
"mean_token_accuracy": 0.8312353953719139,
"step": 535
},
{
"epoch": 0.43351731058705467,
"grad_norm": 0.6288500273743972,
"learning_rate": 9.589691274961556e-06,
"loss": 0.5721,
"mean_token_accuracy": 0.8316881075501442,
"step": 540
},
{
"epoch": 0.437531359759157,
"grad_norm": 0.6590854367714448,
"learning_rate": 9.57567477074666e-06,
"loss": 0.5728,
"mean_token_accuracy": 0.8310303285717964,
"step": 545
},
{
"epoch": 0.44154540893125943,
"grad_norm": 0.7631123718341468,
"learning_rate": 9.561433459748687e-06,
"loss": 0.5723,
"mean_token_accuracy": 0.8313180610537529,
"step": 550
},
{
"epoch": 0.4455594581033618,
"grad_norm": 0.7591778684239452,
"learning_rate": 9.546968041655326e-06,
"loss": 0.5573,
"mean_token_accuracy": 0.8353345975279808,
"step": 555
},
{
"epoch": 0.44957350727546413,
"grad_norm": 0.6624232933704659,
"learning_rate": 9.53227922716484e-06,
"loss": 0.565,
"mean_token_accuracy": 0.8330870345234871,
"step": 560
},
{
"epoch": 0.4535875564475665,
"grad_norm": 0.5786385616805684,
"learning_rate": 9.517367737951144e-06,
"loss": 0.5692,
"mean_token_accuracy": 0.8318209871649742,
"step": 565
},
{
"epoch": 0.45760160561966884,
"grad_norm": 0.690131830320271,
"learning_rate": 9.502234306628354e-06,
"loss": 0.5693,
"mean_token_accuracy": 0.8317399948835373,
"step": 570
},
{
"epoch": 0.4616156547917712,
"grad_norm": 0.7134142378990983,
"learning_rate": 9.48687967671479e-06,
"loss": 0.5609,
"mean_token_accuracy": 0.8342039838433266,
"step": 575
},
{
"epoch": 0.46562970396387354,
"grad_norm": 0.7400587759918655,
"learning_rate": 9.471304602596441e-06,
"loss": 0.5628,
"mean_token_accuracy": 0.8338278353214263,
"step": 580
},
{
"epoch": 0.4696437531359759,
"grad_norm": 0.6806903212075592,
"learning_rate": 9.455509849489915e-06,
"loss": 0.5633,
"mean_token_accuracy": 0.8332584217190743,
"step": 585
},
{
"epoch": 0.47365780230807825,
"grad_norm": 0.761594191887626,
"learning_rate": 9.43949619340483e-06,
"loss": 0.564,
"mean_token_accuracy": 0.8335172370076179,
"step": 590
},
{
"epoch": 0.47767185148018065,
"grad_norm": 0.5988325912168402,
"learning_rate": 9.42326442110569e-06,
"loss": 0.5721,
"mean_token_accuracy": 0.8312305808067322,
"step": 595
},
{
"epoch": 0.481685900652283,
"grad_norm": 0.6255751606480096,
"learning_rate": 9.406815330073244e-06,
"loss": 0.569,
"mean_token_accuracy": 0.8319594085216522,
"step": 600
},
{
"epoch": 0.48569994982438536,
"grad_norm": 0.5996297218515124,
"learning_rate": 9.390149728465285e-06,
"loss": 0.553,
"mean_token_accuracy": 0.8362368091940879,
"step": 605
},
{
"epoch": 0.4897139989964877,
"grad_norm": 0.6918796234724647,
"learning_rate": 9.373268435076959e-06,
"loss": 0.5575,
"mean_token_accuracy": 0.835071188211441,
"step": 610
},
{
"epoch": 0.49372804816859006,
"grad_norm": 0.7009608255947237,
"learning_rate": 9.356172279300528e-06,
"loss": 0.5575,
"mean_token_accuracy": 0.8341048300266266,
"step": 615
},
{
"epoch": 0.4977420973406924,
"grad_norm": 0.6275504392991532,
"learning_rate": 9.338862101084631e-06,
"loss": 0.5636,
"mean_token_accuracy": 0.8333185657858848,
"step": 620
},
{
"epoch": 0.5017561465127948,
"grad_norm": 0.5534731372491267,
"learning_rate": 9.321338750893008e-06,
"loss": 0.5683,
"mean_token_accuracy": 0.8324558317661286,
"step": 625
},
{
"epoch": 0.5057701956848971,
"grad_norm": 0.5603065647360427,
"learning_rate": 9.303603089662717e-06,
"loss": 0.5504,
"mean_token_accuracy": 0.8368956163525582,
"step": 630
},
{
"epoch": 0.5097842448569995,
"grad_norm": 0.6053004197413049,
"learning_rate": 9.285655988761839e-06,
"loss": 0.5499,
"mean_token_accuracy": 0.8363432809710503,
"step": 635
},
{
"epoch": 0.5137982940291018,
"grad_norm": 0.5801686134608613,
"learning_rate": 9.267498329946669e-06,
"loss": 0.5653,
"mean_token_accuracy": 0.8325617238879204,
"step": 640
},
{
"epoch": 0.5178123432012042,
"grad_norm": 0.6071815287621978,
"learning_rate": 9.249131005318388e-06,
"loss": 0.5544,
"mean_token_accuracy": 0.8360264331102372,
"step": 645
},
{
"epoch": 0.5218263923733065,
"grad_norm": 0.6554755596069916,
"learning_rate": 9.230554917279233e-06,
"loss": 0.5581,
"mean_token_accuracy": 0.8350590914487839,
"step": 650
},
{
"epoch": 0.5258404415454089,
"grad_norm": 0.6069071364502077,
"learning_rate": 9.211770978488171e-06,
"loss": 0.5627,
"mean_token_accuracy": 0.833693404495716,
"step": 655
},
{
"epoch": 0.5298544907175113,
"grad_norm": 0.688309769409275,
"learning_rate": 9.192780111816048e-06,
"loss": 0.5632,
"mean_token_accuracy": 0.8338836416602134,
"step": 660
},
{
"epoch": 0.5338685398896137,
"grad_norm": 0.6276359304847686,
"learning_rate": 9.173583250300253e-06,
"loss": 0.5624,
"mean_token_accuracy": 0.8330780416727066,
"step": 665
},
{
"epoch": 0.537882589061716,
"grad_norm": 0.6332615236946093,
"learning_rate": 9.154181337098878e-06,
"loss": 0.552,
"mean_token_accuracy": 0.8359251782298088,
"step": 670
},
{
"epoch": 0.5418966382338184,
"grad_norm": 0.5861055869183572,
"learning_rate": 9.134575325444377e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.8382502719759941,
"step": 675
},
{
"epoch": 0.5459106874059207,
"grad_norm": 0.6171709061249796,
"learning_rate": 9.114766178596734e-06,
"loss": 0.5558,
"mean_token_accuracy": 0.8346521988511085,
"step": 680
},
{
"epoch": 0.5499247365780231,
"grad_norm": 0.5654000660210794,
"learning_rate": 9.09475486979614e-06,
"loss": 0.5528,
"mean_token_accuracy": 0.8365279525518418,
"step": 685
},
{
"epoch": 0.5539387857501255,
"grad_norm": 0.6466821559302002,
"learning_rate": 9.07454238221517e-06,
"loss": 0.5602,
"mean_token_accuracy": 0.8333901852369309,
"step": 690
},
{
"epoch": 0.5579528349222278,
"grad_norm": 0.6175016838311507,
"learning_rate": 9.054129708910486e-06,
"loss": 0.5657,
"mean_token_accuracy": 0.8327378645539284,
"step": 695
},
{
"epoch": 0.5619668840943302,
"grad_norm": 0.5700614104338453,
"learning_rate": 9.033517852774046e-06,
"loss": 0.5501,
"mean_token_accuracy": 0.8366321474313736,
"step": 700
},
{
"epoch": 0.5659809332664325,
"grad_norm": 0.5863271468585884,
"learning_rate": 9.012707826483823e-06,
"loss": 0.5468,
"mean_token_accuracy": 0.8372207880020142,
"step": 705
},
{
"epoch": 0.5699949824385349,
"grad_norm": 0.6369292028334194,
"learning_rate": 8.991700652454066e-06,
"loss": 0.5619,
"mean_token_accuracy": 0.8337433338165283,
"step": 710
},
{
"epoch": 0.5740090316106372,
"grad_norm": 0.5387359333690185,
"learning_rate": 8.970497362785052e-06,
"loss": 0.5579,
"mean_token_accuracy": 0.8344336777925492,
"step": 715
},
{
"epoch": 0.5780230807827396,
"grad_norm": 0.6587993755610946,
"learning_rate": 8.94909899921239e-06,
"loss": 0.5542,
"mean_token_accuracy": 0.8355127662420273,
"step": 720
},
{
"epoch": 0.5820371299548419,
"grad_norm": 0.6401226301007903,
"learning_rate": 8.927506613055839e-06,
"loss": 0.5497,
"mean_token_accuracy": 0.8362575441598892,
"step": 725
},
{
"epoch": 0.5860511791269443,
"grad_norm": 0.6804051477273255,
"learning_rate": 8.905721265167644e-06,
"loss": 0.5399,
"mean_token_accuracy": 0.8386722207069397,
"step": 730
},
{
"epoch": 0.5900652282990466,
"grad_norm": 0.5593126661424827,
"learning_rate": 8.883744025880429e-06,
"loss": 0.5492,
"mean_token_accuracy": 0.8365695863962174,
"step": 735
},
{
"epoch": 0.594079277471149,
"grad_norm": 0.7158554231921177,
"learning_rate": 8.861575974954602e-06,
"loss": 0.5498,
"mean_token_accuracy": 0.8374354973435402,
"step": 740
},
{
"epoch": 0.5980933266432514,
"grad_norm": 0.7747230270377562,
"learning_rate": 8.839218201525312e-06,
"loss": 0.5601,
"mean_token_accuracy": 0.8338992148637772,
"step": 745
},
{
"epoch": 0.6021073758153538,
"grad_norm": 0.6309209085971893,
"learning_rate": 8.816671804048933e-06,
"loss": 0.5496,
"mean_token_accuracy": 0.8368017837405205,
"step": 750
},
{
"epoch": 0.6061214249874561,
"grad_norm": 0.6666014610071999,
"learning_rate": 8.7939378902491e-06,
"loss": 0.5642,
"mean_token_accuracy": 0.8328350514173508,
"step": 755
},
{
"epoch": 0.6101354741595585,
"grad_norm": 0.6202628898930358,
"learning_rate": 8.771017577062282e-06,
"loss": 0.5455,
"mean_token_accuracy": 0.8375746294856071,
"step": 760
},
{
"epoch": 0.6141495233316608,
"grad_norm": 0.6066663449696902,
"learning_rate": 8.747911990582912e-06,
"loss": 0.5542,
"mean_token_accuracy": 0.8355247572064399,
"step": 765
},
{
"epoch": 0.6181635725037632,
"grad_norm": 0.6146156971775205,
"learning_rate": 8.724622266008054e-06,
"loss": 0.5586,
"mean_token_accuracy": 0.834241335093975,
"step": 770
},
{
"epoch": 0.6221776216758655,
"grad_norm": 0.5872550377785694,
"learning_rate": 8.701149547581631e-06,
"loss": 0.5482,
"mean_token_accuracy": 0.8369159802794457,
"step": 775
},
{
"epoch": 0.6261916708479679,
"grad_norm": 0.5902160677997114,
"learning_rate": 8.67749498853821e-06,
"loss": 0.5394,
"mean_token_accuracy": 0.8388776108622551,
"step": 780
},
{
"epoch": 0.6302057200200702,
"grad_norm": 0.5672836827289786,
"learning_rate": 8.65365975104635e-06,
"loss": 0.5531,
"mean_token_accuracy": 0.8354972064495086,
"step": 785
},
{
"epoch": 0.6342197691921726,
"grad_norm": 0.6235670038077569,
"learning_rate": 8.629645006151483e-06,
"loss": 0.5527,
"mean_token_accuracy": 0.8354064971208572,
"step": 790
},
{
"epoch": 0.638233818364275,
"grad_norm": 0.6716148045956016,
"learning_rate": 8.6054519337184e-06,
"loss": 0.5672,
"mean_token_accuracy": 0.8319687351584435,
"step": 795
},
{
"epoch": 0.6422478675363773,
"grad_norm": 0.6872493021964662,
"learning_rate": 8.58108172237327e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.8382853552699089,
"step": 800
},
{
"epoch": 0.6462619167084797,
"grad_norm": 0.567572405722234,
"learning_rate": 8.556535569445252e-06,
"loss": 0.5584,
"mean_token_accuracy": 0.8342087417840958,
"step": 805
},
{
"epoch": 0.650275965880582,
"grad_norm": 0.6202297374631588,
"learning_rate": 8.531814680907664e-06,
"loss": 0.5494,
"mean_token_accuracy": 0.8363103911280632,
"step": 810
},
{
"epoch": 0.6542900150526844,
"grad_norm": 0.7149267451711114,
"learning_rate": 8.506920271318729e-06,
"loss": 0.5526,
"mean_token_accuracy": 0.835868252813816,
"step": 815
},
{
"epoch": 0.6583040642247867,
"grad_norm": 0.5912849173956192,
"learning_rate": 8.481853563761906e-06,
"loss": 0.5453,
"mean_token_accuracy": 0.8372933536767959,
"step": 820
},
{
"epoch": 0.6623181133968891,
"grad_norm": 0.6637136617026432,
"learning_rate": 8.456615789785804e-06,
"loss": 0.5422,
"mean_token_accuracy": 0.8382136434316635,
"step": 825
},
{
"epoch": 0.6663321625689914,
"grad_norm": 0.6995566851869169,
"learning_rate": 8.43120818934367e-06,
"loss": 0.555,
"mean_token_accuracy": 0.8347884580492974,
"step": 830
},
{
"epoch": 0.6703462117410939,
"grad_norm": 0.6602263363301184,
"learning_rate": 8.405632010732462e-06,
"loss": 0.548,
"mean_token_accuracy": 0.8366368874907494,
"step": 835
},
{
"epoch": 0.6743602609131962,
"grad_norm": 0.6796507475619034,
"learning_rate": 8.379888510531536e-06,
"loss": 0.5418,
"mean_token_accuracy": 0.8378365620970726,
"step": 840
},
{
"epoch": 0.6783743100852986,
"grad_norm": 0.6973054982700846,
"learning_rate": 8.353978953540893e-06,
"loss": 0.5467,
"mean_token_accuracy": 0.8372493907809258,
"step": 845
},
{
"epoch": 0.6823883592574009,
"grad_norm": 0.6536264569620208,
"learning_rate": 8.32790461271905e-06,
"loss": 0.5418,
"mean_token_accuracy": 0.8384019210934639,
"step": 850
},
{
"epoch": 0.6864024084295033,
"grad_norm": 0.546683652398305,
"learning_rate": 8.301666769120488e-06,
"loss": 0.5437,
"mean_token_accuracy": 0.8375723645091057,
"step": 855
},
{
"epoch": 0.6904164576016056,
"grad_norm": 0.7963584379304186,
"learning_rate": 8.275266711832722e-06,
"loss": 0.5504,
"mean_token_accuracy": 0.8361119583249093,
"step": 860
},
{
"epoch": 0.694430506773708,
"grad_norm": 0.6129739181303979,
"learning_rate": 8.24870573791296e-06,
"loss": 0.5587,
"mean_token_accuracy": 0.8341295495629311,
"step": 865
},
{
"epoch": 0.6984445559458103,
"grad_norm": 0.6035540038334497,
"learning_rate": 8.221985152324385e-06,
"loss": 0.5438,
"mean_token_accuracy": 0.8377896025776863,
"step": 870
},
{
"epoch": 0.7024586051179127,
"grad_norm": 0.5464317718279572,
"learning_rate": 8.195106267872035e-06,
"loss": 0.5308,
"mean_token_accuracy": 0.8412103086709977,
"step": 875
},
{
"epoch": 0.706472654290015,
"grad_norm": 0.5880182013926408,
"learning_rate": 8.168070405138303e-06,
"loss": 0.5411,
"mean_token_accuracy": 0.8386243000626564,
"step": 880
},
{
"epoch": 0.7104867034621174,
"grad_norm": 0.70991046912602,
"learning_rate": 8.14087889241806e-06,
"loss": 0.5431,
"mean_token_accuracy": 0.8383191004395485,
"step": 885
},
{
"epoch": 0.7145007526342197,
"grad_norm": 0.6562145842410761,
"learning_rate": 8.113533065653395e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.8379684969782829,
"step": 890
},
{
"epoch": 0.7185148018063221,
"grad_norm": 0.6480124383110402,
"learning_rate": 8.086034268367971e-06,
"loss": 0.5422,
"mean_token_accuracy": 0.8380523830652237,
"step": 895
},
{
"epoch": 0.7225288509784245,
"grad_norm": 0.6292142248495385,
"learning_rate": 8.058383851601027e-06,
"loss": 0.5408,
"mean_token_accuracy": 0.8389136686921119,
"step": 900
},
{
"epoch": 0.7265429001505268,
"grad_norm": 0.5820959494127994,
"learning_rate": 8.030583173840997e-06,
"loss": 0.5388,
"mean_token_accuracy": 0.8387292832136154,
"step": 905
},
{
"epoch": 0.7305569493226292,
"grad_norm": 0.6250625002124468,
"learning_rate": 8.002633600958762e-06,
"loss": 0.5468,
"mean_token_accuracy": 0.8370036602020263,
"step": 910
},
{
"epoch": 0.7345709984947315,
"grad_norm": 0.672514782103767,
"learning_rate": 7.974536506140546e-06,
"loss": 0.5441,
"mean_token_accuracy": 0.8380858764052391,
"step": 915
},
{
"epoch": 0.738585047666834,
"grad_norm": 0.590876941682476,
"learning_rate": 7.946293269820456e-06,
"loss": 0.546,
"mean_token_accuracy": 0.8371559247374535,
"step": 920
},
{
"epoch": 0.7425990968389363,
"grad_norm": 0.6118643146843981,
"learning_rate": 7.917905279612648e-06,
"loss": 0.5325,
"mean_token_accuracy": 0.840439823269844,
"step": 925
},
{
"epoch": 0.7466131460110387,
"grad_norm": 0.7336641277164324,
"learning_rate": 7.889373930243166e-06,
"loss": 0.5485,
"mean_token_accuracy": 0.8366749197244644,
"step": 930
},
{
"epoch": 0.750627195183141,
"grad_norm": 0.7152604521519405,
"learning_rate": 7.860700623481404e-06,
"loss": 0.5427,
"mean_token_accuracy": 0.8377209782600403,
"step": 935
},
{
"epoch": 0.7546412443552434,
"grad_norm": 0.6776957187532802,
"learning_rate": 7.831886768071249e-06,
"loss": 0.5362,
"mean_token_accuracy": 0.8392210811376571,
"step": 940
},
{
"epoch": 0.7586552935273457,
"grad_norm": 0.770986244124321,
"learning_rate": 7.80293377966186e-06,
"loss": 0.5458,
"mean_token_accuracy": 0.8375400707125664,
"step": 945
},
{
"epoch": 0.7626693426994481,
"grad_norm": 0.6825578099616549,
"learning_rate": 7.77384308073812e-06,
"loss": 0.5453,
"mean_token_accuracy": 0.8378095313906669,
"step": 950
},
{
"epoch": 0.7666833918715504,
"grad_norm": 5.294879176126245,
"learning_rate": 7.744616100550743e-06,
"loss": 0.5467,
"mean_token_accuracy": 0.8369431406259537,
"step": 955
},
{
"epoch": 0.7706974410436528,
"grad_norm": 0.8415837092408757,
"learning_rate": 7.715254275046062e-06,
"loss": 0.5452,
"mean_token_accuracy": 0.837572930753231,
"step": 960
},
{
"epoch": 0.7747114902157551,
"grad_norm": 0.5986581958942131,
"learning_rate": 7.68575904679547e-06,
"loss": 0.5391,
"mean_token_accuracy": 0.8389428481459618,
"step": 965
},
{
"epoch": 0.7787255393878575,
"grad_norm": 0.5612943684118845,
"learning_rate": 7.65613186492455e-06,
"loss": 0.5309,
"mean_token_accuracy": 0.8410877391695977,
"step": 970
},
{
"epoch": 0.7827395885599598,
"grad_norm": 0.5892948923721739,
"learning_rate": 7.626374185041887e-06,
"loss": 0.5339,
"mean_token_accuracy": 0.8404828563332558,
"step": 975
},
{
"epoch": 0.7867536377320622,
"grad_norm": 0.614935197747847,
"learning_rate": 7.596487469167531e-06,
"loss": 0.5417,
"mean_token_accuracy": 0.8379798352718353,
"step": 980
},
{
"epoch": 0.7907676869041645,
"grad_norm": 0.5572327756360332,
"learning_rate": 7.566473185661187e-06,
"loss": 0.5379,
"mean_token_accuracy": 0.839057058095932,
"step": 985
},
{
"epoch": 0.7947817360762669,
"grad_norm": 0.5918529196648193,
"learning_rate": 7.536332809150066e-06,
"loss": 0.5387,
"mean_token_accuracy": 0.8385102912783623,
"step": 990
},
{
"epoch": 0.7987957852483693,
"grad_norm": 0.5961572996178651,
"learning_rate": 7.506067820456438e-06,
"loss": 0.5458,
"mean_token_accuracy": 0.8371727049350739,
"step": 995
},
{
"epoch": 0.8028098344204716,
"grad_norm": 0.6305016848759029,
"learning_rate": 7.475679706524864e-06,
"loss": 0.5357,
"mean_token_accuracy": 0.8398390769958496,
"step": 1000
},
{
"epoch": 0.806823883592574,
"grad_norm": 0.5798737021411499,
"learning_rate": 7.445169960349167e-06,
"loss": 0.5332,
"mean_token_accuracy": 0.8406742095947266,
"step": 1005
},
{
"epoch": 0.8108379327646764,
"grad_norm": 0.6561676520637244,
"learning_rate": 7.414540080899056e-06,
"loss": 0.5469,
"mean_token_accuracy": 0.8368082106113434,
"step": 1010
},
{
"epoch": 0.8148519819367788,
"grad_norm": 0.5915371150391379,
"learning_rate": 7.3837915730464896e-06,
"loss": 0.5371,
"mean_token_accuracy": 0.8387147217988968,
"step": 1015
},
{
"epoch": 0.8188660311088811,
"grad_norm": 0.5335073385354651,
"learning_rate": 7.3529259474917455e-06,
"loss": 0.5358,
"mean_token_accuracy": 0.8394276514649391,
"step": 1020
},
{
"epoch": 0.8228800802809835,
"grad_norm": 0.5386012526091869,
"learning_rate": 7.321944720689191e-06,
"loss": 0.5366,
"mean_token_accuracy": 0.8397987276315689,
"step": 1025
},
{
"epoch": 0.8268941294530858,
"grad_norm": 0.5223449430699659,
"learning_rate": 7.290849414772779e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.8395177885890007,
"step": 1030
},
{
"epoch": 0.8309081786251882,
"grad_norm": 0.5706841180761181,
"learning_rate": 7.2596415574812695e-06,
"loss": 0.5403,
"mean_token_accuracy": 0.8387762665748596,
"step": 1035
},
{
"epoch": 0.8349222277972905,
"grad_norm": 0.6290586727279687,
"learning_rate": 7.228322682083164e-06,
"loss": 0.5351,
"mean_token_accuracy": 0.8395211502909661,
"step": 1040
},
{
"epoch": 0.8389362769693929,
"grad_norm": 0.5277748021387862,
"learning_rate": 7.196894327301378e-06,
"loss": 0.5295,
"mean_token_accuracy": 0.8414138451218605,
"step": 1045
},
{
"epoch": 0.8429503261414952,
"grad_norm": 0.5539533785866126,
"learning_rate": 7.165358037237644e-06,
"loss": 0.5388,
"mean_token_accuracy": 0.8386986523866653,
"step": 1050
},
{
"epoch": 0.8469643753135976,
"grad_norm": 0.5062193058912297,
"learning_rate": 7.1337153612966455e-06,
"loss": 0.5349,
"mean_token_accuracy": 0.8398931756615639,
"step": 1055
},
{
"epoch": 0.8509784244856999,
"grad_norm": 0.5622943196472996,
"learning_rate": 7.1019678541098945e-06,
"loss": 0.5378,
"mean_token_accuracy": 0.8387416958808899,
"step": 1060
},
{
"epoch": 0.8549924736578023,
"grad_norm": 0.5718749819810406,
"learning_rate": 7.0701170754593516e-06,
"loss": 0.54,
"mean_token_accuracy": 0.8385371461510658,
"step": 1065
},
{
"epoch": 0.8590065228299046,
"grad_norm": 0.6435557666099391,
"learning_rate": 7.038164590200789e-06,
"loss": 0.5282,
"mean_token_accuracy": 0.8415184810757637,
"step": 1070
},
{
"epoch": 0.863020572002007,
"grad_norm": 0.6743895521038166,
"learning_rate": 7.006111968186914e-06,
"loss": 0.5332,
"mean_token_accuracy": 0.8404285505414009,
"step": 1075
},
{
"epoch": 0.8670346211741093,
"grad_norm": 0.6869985118812036,
"learning_rate": 6.9739607841902365e-06,
"loss": 0.538,
"mean_token_accuracy": 0.8389334738254547,
"step": 1080
},
{
"epoch": 0.8710486703462117,
"grad_norm": 0.62070604390802,
"learning_rate": 6.941712617825701e-06,
"loss": 0.5351,
"mean_token_accuracy": 0.8399768278002739,
"step": 1085
},
{
"epoch": 0.875062719518314,
"grad_norm": 0.5747195866768225,
"learning_rate": 6.909369053473079e-06,
"loss": 0.5428,
"mean_token_accuracy": 0.8378928422927856,
"step": 1090
},
{
"epoch": 0.8790767686904165,
"grad_norm": 0.5877150605799464,
"learning_rate": 6.876931680199121e-06,
"loss": 0.5298,
"mean_token_accuracy": 0.8409986332058906,
"step": 1095
},
{
"epoch": 0.8830908178625189,
"grad_norm": 0.598163324475379,
"learning_rate": 6.844402091679494e-06,
"loss": 0.5359,
"mean_token_accuracy": 0.8396167665719986,
"step": 1100
},
{
"epoch": 0.8871048670346212,
"grad_norm": 0.5988514234545925,
"learning_rate": 6.811781886120479e-06,
"loss": 0.5416,
"mean_token_accuracy": 0.8384165868163109,
"step": 1105
},
{
"epoch": 0.8911189162067236,
"grad_norm": 0.5253771613266209,
"learning_rate": 6.779072666180447e-06,
"loss": 0.5381,
"mean_token_accuracy": 0.8387709230184555,
"step": 1110
},
{
"epoch": 0.8951329653788259,
"grad_norm": 0.6133443221994367,
"learning_rate": 6.746276038891117e-06,
"loss": 0.54,
"mean_token_accuracy": 0.839115546643734,
"step": 1115
},
{
"epoch": 0.8991470145509283,
"grad_norm": 0.5596525836914499,
"learning_rate": 6.713393615578616e-06,
"loss": 0.5378,
"mean_token_accuracy": 0.8392528027296067,
"step": 1120
},
{
"epoch": 0.9031610637230306,
"grad_norm": 0.5821761444332735,
"learning_rate": 6.680427011784292e-06,
"loss": 0.537,
"mean_token_accuracy": 0.839189724624157,
"step": 1125
},
{
"epoch": 0.907175112895133,
"grad_norm": 0.894955820302947,
"learning_rate": 6.6473778471853536e-06,
"loss": 0.5359,
"mean_token_accuracy": 0.8388691842556,
"step": 1130
},
{
"epoch": 0.9111891620672353,
"grad_norm": 0.5737563610744022,
"learning_rate": 6.614247745515298e-06,
"loss": 0.5423,
"mean_token_accuracy": 0.838544836640358,
"step": 1135
},
{
"epoch": 0.9152032112393377,
"grad_norm": 0.5692294184392518,
"learning_rate": 6.58103833448412e-06,
"loss": 0.5407,
"mean_token_accuracy": 0.8381945803761482,
"step": 1140
},
{
"epoch": 0.91921726041144,
"grad_norm": 0.5742117002922255,
"learning_rate": 6.5477512456983595e-06,
"loss": 0.5291,
"mean_token_accuracy": 0.8412258476018906,
"step": 1145
},
{
"epoch": 0.9232313095835424,
"grad_norm": 0.5517555661869946,
"learning_rate": 6.514388114580924e-06,
"loss": 0.5309,
"mean_token_accuracy": 0.8410136729478837,
"step": 1150
},
{
"epoch": 0.9272453587556447,
"grad_norm": 0.6314576791547585,
"learning_rate": 6.480950580290751e-06,
"loss": 0.5398,
"mean_token_accuracy": 0.8388326406478882,
"step": 1155
},
{
"epoch": 0.9312594079277471,
"grad_norm": 0.5349229116155992,
"learning_rate": 6.44744028564226e-06,
"loss": 0.5403,
"mean_token_accuracy": 0.8383991166949272,
"step": 1160
},
{
"epoch": 0.9352734570998494,
"grad_norm": 0.5457988264371361,
"learning_rate": 6.413858877024659e-06,
"loss": 0.5385,
"mean_token_accuracy": 0.8390275910496712,
"step": 1165
},
{
"epoch": 0.9392875062719518,
"grad_norm": 0.6345026890776209,
"learning_rate": 6.380208004321037e-06,
"loss": 0.5266,
"mean_token_accuracy": 0.8420624524354935,
"step": 1170
},
{
"epoch": 0.9433015554440541,
"grad_norm": 0.5897108286808963,
"learning_rate": 6.34648932082732e-06,
"loss": 0.5298,
"mean_token_accuracy": 0.8406174406409264,
"step": 1175
},
{
"epoch": 0.9473156046161565,
"grad_norm": 0.5868359598483316,
"learning_rate": 6.312704483171029e-06,
"loss": 0.53,
"mean_token_accuracy": 0.8407926484942436,
"step": 1180
},
{
"epoch": 0.951329653788259,
"grad_norm": 0.6313928647086123,
"learning_rate": 6.2788551512299014e-06,
"loss": 0.531,
"mean_token_accuracy": 0.8403575956821442,
"step": 1185
},
{
"epoch": 0.9553437029603613,
"grad_norm": 0.5798903955533093,
"learning_rate": 6.244942988050325e-06,
"loss": 0.5468,
"mean_token_accuracy": 0.8370823442935944,
"step": 1190
},
{
"epoch": 0.9593577521324637,
"grad_norm": 0.5349118049394295,
"learning_rate": 6.210969659765651e-06,
"loss": 0.5327,
"mean_token_accuracy": 0.8406338363885879,
"step": 1195
},
{
"epoch": 0.963371801304566,
"grad_norm": 0.4950155751846596,
"learning_rate": 6.1769368355143125e-06,
"loss": 0.5205,
"mean_token_accuracy": 0.8438980832695961,
"step": 1200
},
{
"epoch": 0.9673858504766684,
"grad_norm": 0.5323860455486669,
"learning_rate": 6.142846187357839e-06,
"loss": 0.5313,
"mean_token_accuracy": 0.840233464539051,
"step": 1205
},
{
"epoch": 0.9713998996487707,
"grad_norm": 0.4817617225243705,
"learning_rate": 6.108699390198691e-06,
"loss": 0.5335,
"mean_token_accuracy": 0.8402068182826042,
"step": 1210
},
{
"epoch": 0.9754139488208731,
"grad_norm": 0.4995196546081424,
"learning_rate": 6.074498121697983e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8414423301815986,
"step": 1215
},
{
"epoch": 0.9794279979929754,
"grad_norm": 0.5750567202600121,
"learning_rate": 6.04024406219305e-06,
"loss": 0.5319,
"mean_token_accuracy": 0.8403345927596092,
"step": 1220
},
{
"epoch": 0.9834420471650778,
"grad_norm": 0.554410393503299,
"learning_rate": 6.0059388946148885e-06,
"loss": 0.5362,
"mean_token_accuracy": 0.8393355578184127,
"step": 1225
},
{
"epoch": 0.9874560963371801,
"grad_norm": 0.5547067674499261,
"learning_rate": 5.971584304405489e-06,
"loss": 0.5333,
"mean_token_accuracy": 0.8406757906079292,
"step": 1230
},
{
"epoch": 0.9914701455092825,
"grad_norm": 0.541496404401069,
"learning_rate": 5.937181979435007e-06,
"loss": 0.5402,
"mean_token_accuracy": 0.8378434419631958,
"step": 1235
},
{
"epoch": 0.9954841946813848,
"grad_norm": 0.565488616851699,
"learning_rate": 5.902733609918857e-06,
"loss": 0.5421,
"mean_token_accuracy": 0.8383191093802452,
"step": 1240
},
{
"epoch": 0.9994982438534872,
"grad_norm": 0.5830669897046137,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.5341,
"mean_token_accuracy": 0.8402512192726135,
"step": 1245
},
{
"epoch": 1.0040140491721024,
"grad_norm": 0.5255806082753902,
"learning_rate": 5.833705509339067e-06,
"loss": 0.5977,
"mean_token_accuracy": 0.8489638639659416,
"step": 1250
},
{
"epoch": 1.0080280983442047,
"grad_norm": 0.61241581013064,
"learning_rate": 5.799129169684566e-06,
"loss": 0.4891,
"mean_token_accuracy": 0.851288877427578,
"step": 1255
},
{
"epoch": 1.012042147516307,
"grad_norm": 0.6700704727742163,
"learning_rate": 5.76451356813605e-06,
"loss": 0.4878,
"mean_token_accuracy": 0.8516981139779091,
"step": 1260
},
{
"epoch": 1.0160561966884094,
"grad_norm": 0.5456983971902244,
"learning_rate": 5.729860405387384e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.8516273483633995,
"step": 1265
},
{
"epoch": 1.0200702458605118,
"grad_norm": 0.6175710890585946,
"learning_rate": 5.6951713839778565e-06,
"loss": 0.4833,
"mean_token_accuracy": 0.8524551376700401,
"step": 1270
},
{
"epoch": 1.0240842950326141,
"grad_norm": 0.5253091539774541,
"learning_rate": 5.660448208208513e-06,
"loss": 0.4838,
"mean_token_accuracy": 0.8522418588399887,
"step": 1275
},
{
"epoch": 1.0280983442047165,
"grad_norm": 0.5012462654206351,
"learning_rate": 5.625692584058434e-06,
"loss": 0.4823,
"mean_token_accuracy": 0.8529530748724937,
"step": 1280
},
{
"epoch": 1.0321123933768188,
"grad_norm": 0.5592121551569235,
"learning_rate": 5.590906219100919e-06,
"loss": 0.4905,
"mean_token_accuracy": 0.8506887316703796,
"step": 1285
},
{
"epoch": 1.0361264425489212,
"grad_norm": 0.5153674677293302,
"learning_rate": 5.556090822419589e-06,
"loss": 0.4881,
"mean_token_accuracy": 0.8507951095700264,
"step": 1290
},
{
"epoch": 1.0401404917210235,
"grad_norm": 0.6088145024835736,
"learning_rate": 5.521248104524415e-06,
"loss": 0.4784,
"mean_token_accuracy": 0.8535375446081161,
"step": 1295
},
{
"epoch": 1.0441545408931259,
"grad_norm": 0.4664157690543253,
"learning_rate": 5.4863797772676865e-06,
"loss": 0.4781,
"mean_token_accuracy": 0.8540792852640152,
"step": 1300
},
{
"epoch": 1.0481685900652282,
"grad_norm": 0.5354927183536671,
"learning_rate": 5.451487553759899e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.8520985931158066,
"step": 1305
},
{
"epoch": 1.0521826392373306,
"grad_norm": 0.5411262184931188,
"learning_rate": 5.416573148285594e-06,
"loss": 0.4812,
"mean_token_accuracy": 0.8527259394526482,
"step": 1310
},
{
"epoch": 1.056196688409433,
"grad_norm": 0.5828680936268568,
"learning_rate": 5.3816382762191314e-06,
"loss": 0.4872,
"mean_token_accuracy": 0.8516100868582726,
"step": 1315
},
{
"epoch": 1.0602107375815353,
"grad_norm": 0.5202254772193751,
"learning_rate": 5.346684653940408e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8536487385630608,
"step": 1320
},
{
"epoch": 1.0642247867536376,
"grad_norm": 0.6388141647258224,
"learning_rate": 5.311713998750543e-06,
"loss": 0.4881,
"mean_token_accuracy": 0.85139652043581,
"step": 1325
},
{
"epoch": 1.0682388359257402,
"grad_norm": 0.5465015590788906,
"learning_rate": 5.276728028787489e-06,
"loss": 0.4821,
"mean_token_accuracy": 0.8529908075928688,
"step": 1330
},
{
"epoch": 1.0722528850978423,
"grad_norm": 0.5594552749543369,
"learning_rate": 5.24172846294163e-06,
"loss": 0.4847,
"mean_token_accuracy": 0.8517061173915863,
"step": 1335
},
{
"epoch": 1.076266934269945,
"grad_norm": 0.6110305995724576,
"learning_rate": 5.206717020771323e-06,
"loss": 0.4923,
"mean_token_accuracy": 0.8506526678800583,
"step": 1340
},
{
"epoch": 1.0802809834420473,
"grad_norm": 0.5684290645772769,
"learning_rate": 5.171695422418429e-06,
"loss": 0.4832,
"mean_token_accuracy": 0.8523881688714028,
"step": 1345
},
{
"epoch": 1.0842950326141496,
"grad_norm": 0.623945297590145,
"learning_rate": 5.136665388523779e-06,
"loss": 0.4796,
"mean_token_accuracy": 0.8534211605787277,
"step": 1350
},
{
"epoch": 1.088309081786252,
"grad_norm": 0.784850879982968,
"learning_rate": 5.101628640142655e-06,
"loss": 0.4794,
"mean_token_accuracy": 0.8535040110349655,
"step": 1355
},
{
"epoch": 1.0923231309583543,
"grad_norm": 0.47623171391541286,
"learning_rate": 5.06658689866023e-06,
"loss": 0.4817,
"mean_token_accuracy": 0.8525002762675286,
"step": 1360
},
{
"epoch": 1.0963371801304567,
"grad_norm": 0.5042352606147674,
"learning_rate": 5.031541885706987e-06,
"loss": 0.4813,
"mean_token_accuracy": 0.8532689422369003,
"step": 1365
},
{
"epoch": 1.100351229302559,
"grad_norm": 0.5016426811840553,
"learning_rate": 4.99649532307414e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.8514423102140427,
"step": 1370
},
{
"epoch": 1.1043652784746614,
"grad_norm": 0.5268578218681798,
"learning_rate": 4.961448932629047e-06,
"loss": 0.4842,
"mean_token_accuracy": 0.8524499326944351,
"step": 1375
},
{
"epoch": 1.1083793276467637,
"grad_norm": 0.5144802084081512,
"learning_rate": 4.926404436230596e-06,
"loss": 0.4831,
"mean_token_accuracy": 0.8526805534958839,
"step": 1380
},
{
"epoch": 1.112393376818866,
"grad_norm": 0.5062678557635162,
"learning_rate": 4.891363555644623e-06,
"loss": 0.4768,
"mean_token_accuracy": 0.8536328047513961,
"step": 1385
},
{
"epoch": 1.1164074259909684,
"grad_norm": 0.5784487909262853,
"learning_rate": 4.8563280124593205e-06,
"loss": 0.4837,
"mean_token_accuracy": 0.8526077657938004,
"step": 1390
},
{
"epoch": 1.1204214751630708,
"grad_norm": 0.5599564503109443,
"learning_rate": 4.821299528000643e-06,
"loss": 0.4764,
"mean_token_accuracy": 0.8544846564531327,
"step": 1395
},
{
"epoch": 1.1244355243351731,
"grad_norm": 0.5051397425603605,
"learning_rate": 4.786279823247749e-06,
"loss": 0.4841,
"mean_token_accuracy": 0.8521718412637711,
"step": 1400
},
{
"epoch": 1.1284495735072755,
"grad_norm": 0.5564697170540107,
"learning_rate": 4.751270618748439e-06,
"loss": 0.482,
"mean_token_accuracy": 0.8527628138661385,
"step": 1405
},
{
"epoch": 1.1324636226793778,
"grad_norm": 0.5462012358781666,
"learning_rate": 4.71627363453463e-06,
"loss": 0.481,
"mean_token_accuracy": 0.8532051593065262,
"step": 1410
},
{
"epoch": 1.1364776718514802,
"grad_norm": 0.47939684892732765,
"learning_rate": 4.681290590037845e-06,
"loss": 0.4861,
"mean_token_accuracy": 0.8520785883069039,
"step": 1415
},
{
"epoch": 1.1404917210235825,
"grad_norm": 0.49652735975196016,
"learning_rate": 4.6463232040047355e-06,
"loss": 0.4749,
"mean_token_accuracy": 0.8546465337276459,
"step": 1420
},
{
"epoch": 1.144505770195685,
"grad_norm": 0.47650301494794856,
"learning_rate": 4.61137319441264e-06,
"loss": 0.488,
"mean_token_accuracy": 0.8511898010969162,
"step": 1425
},
{
"epoch": 1.1485198193677872,
"grad_norm": 0.49365022232137656,
"learning_rate": 4.57644227838518e-06,
"loss": 0.4773,
"mean_token_accuracy": 0.8540461644530296,
"step": 1430
},
{
"epoch": 1.1525338685398896,
"grad_norm": 0.5174719592435709,
"learning_rate": 4.541532172107891e-06,
"loss": 0.4764,
"mean_token_accuracy": 0.8547235488891601,
"step": 1435
},
{
"epoch": 1.156547917711992,
"grad_norm": 0.6030063953889359,
"learning_rate": 4.5066445907439104e-06,
"loss": 0.4846,
"mean_token_accuracy": 0.8522387713193893,
"step": 1440
},
{
"epoch": 1.1605619668840943,
"grad_norm": 0.48832132466767514,
"learning_rate": 4.471781248349702e-06,
"loss": 0.4868,
"mean_token_accuracy": 0.8517817571759224,
"step": 1445
},
{
"epoch": 1.1645760160561967,
"grad_norm": 0.5530963471918582,
"learning_rate": 4.436943857790859e-06,
"loss": 0.4882,
"mean_token_accuracy": 0.8510265350341797,
"step": 1450
},
{
"epoch": 1.168590065228299,
"grad_norm": 0.5744529358259624,
"learning_rate": 4.402134130657925e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8536973863840103,
"step": 1455
},
{
"epoch": 1.1726041144004014,
"grad_norm": 1.0812219162154455,
"learning_rate": 4.367353777182332e-06,
"loss": 0.4876,
"mean_token_accuracy": 0.8515413969755172,
"step": 1460
},
{
"epoch": 1.1766181635725037,
"grad_norm": 0.505147254276408,
"learning_rate": 4.332604506152341e-06,
"loss": 0.4782,
"mean_token_accuracy": 0.8540340691804886,
"step": 1465
},
{
"epoch": 1.180632212744606,
"grad_norm": 0.5103240298779389,
"learning_rate": 4.297888024829126e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.8533610329031944,
"step": 1470
},
{
"epoch": 1.1846462619167084,
"grad_norm": 0.5611328032960321,
"learning_rate": 4.263206038862858e-06,
"loss": 0.4885,
"mean_token_accuracy": 0.8507640421390533,
"step": 1475
},
{
"epoch": 1.1886603110888108,
"grad_norm": 0.5136330037040837,
"learning_rate": 4.22856025220893e-06,
"loss": 0.4896,
"mean_token_accuracy": 0.8505268082022667,
"step": 1480
},
{
"epoch": 1.1926743602609131,
"grad_norm": 0.4853833862909341,
"learning_rate": 4.193952367044232e-06,
"loss": 0.4815,
"mean_token_accuracy": 0.8532075121998787,
"step": 1485
},
{
"epoch": 1.1966884094330155,
"grad_norm": 0.5147177386874451,
"learning_rate": 4.159384083683518e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8535997048020363,
"step": 1490
},
{
"epoch": 1.2007024586051178,
"grad_norm": 0.5497574369895587,
"learning_rate": 4.124857100495877e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.8514174923300744,
"step": 1495
},
{
"epoch": 1.2047165077772202,
"grad_norm": 0.6226254194200767,
"learning_rate": 4.090373113821281e-06,
"loss": 0.4817,
"mean_token_accuracy": 0.8529708757996559,
"step": 1500
},
{
"epoch": 1.2087305569493227,
"grad_norm": 0.5488518920357796,
"learning_rate": 4.055933817887247e-06,
"loss": 0.4842,
"mean_token_accuracy": 0.8528445586562157,
"step": 1505
},
{
"epoch": 1.2127446061214249,
"grad_norm": 0.5221887780865238,
"learning_rate": 4.021540904725603e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.8525567546486854,
"step": 1510
},
{
"epoch": 1.2167586552935274,
"grad_norm": 0.5594564612440402,
"learning_rate": 3.987196064089346e-06,
"loss": 0.4769,
"mean_token_accuracy": 0.8545236945152282,
"step": 1515
},
{
"epoch": 1.2207727044656298,
"grad_norm": 0.5255224366679334,
"learning_rate": 3.952900983369632e-06,
"loss": 0.487,
"mean_token_accuracy": 0.8506386041641235,
"step": 1520
},
{
"epoch": 1.2247867536377322,
"grad_norm": 0.5726647439659638,
"learning_rate": 3.91865734751287e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.8534023508429527,
"step": 1525
},
{
"epoch": 1.2288008028098345,
"grad_norm": 0.5624813503686409,
"learning_rate": 3.88446683893794e-06,
"loss": 0.482,
"mean_token_accuracy": 0.8528756931424141,
"step": 1530
},
{
"epoch": 1.2328148519819369,
"grad_norm": 0.5195532566310461,
"learning_rate": 3.850331137453529e-06,
"loss": 0.4819,
"mean_token_accuracy": 0.8532358273863793,
"step": 1535
},
{
"epoch": 1.2368289011540392,
"grad_norm": 0.5472089753849174,
"learning_rate": 3.816251920175611e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8497111722826958,
"step": 1540
},
{
"epoch": 1.2408429503261416,
"grad_norm": 0.5368326431573441,
"learning_rate": 3.782230861445041e-06,
"loss": 0.4739,
"mean_token_accuracy": 0.8550259992480278,
"step": 1545
},
{
"epoch": 1.244856999498244,
"grad_norm": 0.4915065349616282,
"learning_rate": 3.7482696327452926e-06,
"loss": 0.4827,
"mean_token_accuracy": 0.8524277821183205,
"step": 1550
},
{
"epoch": 1.2488710486703463,
"grad_norm": 0.48495072086168006,
"learning_rate": 3.714369902620345e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8536085486412048,
"step": 1555
},
{
"epoch": 1.2528850978424486,
"grad_norm": 1.750136693379065,
"learning_rate": 3.6805333365926943e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.8515821918845177,
"step": 1560
},
{
"epoch": 1.256899147014551,
"grad_norm": 0.5041608097022385,
"learning_rate": 3.6467615970815323e-06,
"loss": 0.4837,
"mean_token_accuracy": 0.8521517217159271,
"step": 1565
},
{
"epoch": 1.2609131961866533,
"grad_norm": 0.5630876506506443,
"learning_rate": 3.613056343321073e-06,
"loss": 0.4676,
"mean_token_accuracy": 0.8564149275422096,
"step": 1570
},
{
"epoch": 1.2649272453587557,
"grad_norm": 0.5419703298529472,
"learning_rate": 3.579419231279023e-06,
"loss": 0.4746,
"mean_token_accuracy": 0.8546606913208962,
"step": 1575
},
{
"epoch": 1.268941294530858,
"grad_norm": 0.5500842339907406,
"learning_rate": 3.5458519135752346e-06,
"loss": 0.4786,
"mean_token_accuracy": 0.8540479198098183,
"step": 1580
},
{
"epoch": 1.2729553437029604,
"grad_norm": 0.5574331160594922,
"learning_rate": 3.5123560394005004e-06,
"loss": 0.4877,
"mean_token_accuracy": 0.8515752986073494,
"step": 1585
},
{
"epoch": 1.2769693928750627,
"grad_norm": 0.5251115350284153,
"learning_rate": 3.478933254435534e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8529331281781196,
"step": 1590
},
{
"epoch": 1.280983442047165,
"grad_norm": 0.5718874948859872,
"learning_rate": 3.4455852007701154e-06,
"loss": 0.4824,
"mean_token_accuracy": 0.8529073163866997,
"step": 1595
},
{
"epoch": 1.2849974912192674,
"grad_norm": 0.5236497682255199,
"learning_rate": 3.4123135168224053e-06,
"loss": 0.4842,
"mean_token_accuracy": 0.8519692197442055,
"step": 1600
},
{
"epoch": 1.2890115403913698,
"grad_norm": 0.4884629705033181,
"learning_rate": 3.3791198372584664e-06,
"loss": 0.4781,
"mean_token_accuracy": 0.8534619972109795,
"step": 1605
},
{
"epoch": 1.2930255895634721,
"grad_norm": 0.5303681824959834,
"learning_rate": 3.3460057929119306e-06,
"loss": 0.4868,
"mean_token_accuracy": 0.850970309972763,
"step": 1610
},
{
"epoch": 1.2970396387355745,
"grad_norm": 0.5123281384081424,
"learning_rate": 3.3129730107038916e-06,
"loss": 0.4903,
"mean_token_accuracy": 0.8506717085838318,
"step": 1615
},
{
"epoch": 1.3010536879076768,
"grad_norm": 0.4899717227978676,
"learning_rate": 3.280023113562957e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8520964190363884,
"step": 1620
},
{
"epoch": 1.3050677370797792,
"grad_norm": 0.5052149939852724,
"learning_rate": 3.2471577203455263e-06,
"loss": 0.4807,
"mean_token_accuracy": 0.8533585995435715,
"step": 1625
},
{
"epoch": 1.3090817862518815,
"grad_norm": 0.5364809069727472,
"learning_rate": 3.21437844575625e-06,
"loss": 0.4783,
"mean_token_accuracy": 0.8539668470621109,
"step": 1630
},
{
"epoch": 1.313095835423984,
"grad_norm": 0.49451274386147803,
"learning_rate": 3.181686900268694e-06,
"loss": 0.4717,
"mean_token_accuracy": 0.8559211567044258,
"step": 1635
},
{
"epoch": 1.3171098845960862,
"grad_norm": 0.4754946294939683,
"learning_rate": 3.149084690046221e-06,
"loss": 0.478,
"mean_token_accuracy": 0.8539295867085457,
"step": 1640
},
{
"epoch": 1.3211239337681886,
"grad_norm": 0.4807750565396974,
"learning_rate": 3.1165734168630767e-06,
"loss": 0.4703,
"mean_token_accuracy": 0.8557563424110413,
"step": 1645
},
{
"epoch": 1.325137982940291,
"grad_norm": 0.4783048679547648,
"learning_rate": 3.084154678025692e-06,
"loss": 0.4736,
"mean_token_accuracy": 0.8553013518452645,
"step": 1650
},
{
"epoch": 1.3291520321123933,
"grad_norm": 0.4870454651116592,
"learning_rate": 3.051830066294207e-06,
"loss": 0.4728,
"mean_token_accuracy": 0.8551506981253624,
"step": 1655
},
{
"epoch": 1.3331660812844957,
"grad_norm": 0.4992080659924365,
"learning_rate": 3.019601169804216e-06,
"loss": 0.4799,
"mean_token_accuracy": 0.8541036993265152,
"step": 1660
},
{
"epoch": 1.337180130456598,
"grad_norm": 0.4796718782317687,
"learning_rate": 2.9874695719887463e-06,
"loss": 0.4737,
"mean_token_accuracy": 0.8544951483607293,
"step": 1665
},
{
"epoch": 1.3411941796287006,
"grad_norm": 0.4747418088055164,
"learning_rate": 2.955436851500454e-06,
"loss": 0.4718,
"mean_token_accuracy": 0.8550894737243653,
"step": 1670
},
{
"epoch": 1.3452082288008027,
"grad_norm": 0.4740811703629381,
"learning_rate": 2.9235045821340713e-06,
"loss": 0.4775,
"mean_token_accuracy": 0.8539848864078522,
"step": 1675
},
{
"epoch": 1.3492222779729053,
"grad_norm": 0.5040859930530716,
"learning_rate": 2.89167433274908e-06,
"loss": 0.4844,
"mean_token_accuracy": 0.8522534683346749,
"step": 1680
},
{
"epoch": 1.3532363271450074,
"grad_norm": 0.4685574013809817,
"learning_rate": 2.859947667192636e-06,
"loss": 0.4675,
"mean_token_accuracy": 0.8567684695124627,
"step": 1685
},
{
"epoch": 1.35725037631711,
"grad_norm": 0.5414825961816714,
"learning_rate": 2.8283261442227303e-06,
"loss": 0.4835,
"mean_token_accuracy": 0.8527524784207344,
"step": 1690
},
{
"epoch": 1.3612644254892121,
"grad_norm": 0.4767368636637838,
"learning_rate": 2.7968113174316102e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.8519262120127677,
"step": 1695
},
{
"epoch": 1.3652784746613147,
"grad_norm": 0.4762349732624997,
"learning_rate": 2.765404735169454e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8536876887083054,
"step": 1700
},
{
"epoch": 1.369292523833417,
"grad_norm": 0.4603866793711708,
"learning_rate": 2.7341079404682887e-06,
"loss": 0.483,
"mean_token_accuracy": 0.8522417232394218,
"step": 1705
},
{
"epoch": 1.3733065730055194,
"grad_norm": 0.48783583675642933,
"learning_rate": 2.702922470966187e-06,
"loss": 0.4861,
"mean_token_accuracy": 0.8517315194010735,
"step": 1710
},
{
"epoch": 1.3773206221776217,
"grad_norm": 0.457565482089079,
"learning_rate": 2.671849858831721e-06,
"loss": 0.4781,
"mean_token_accuracy": 0.8536698654294014,
"step": 1715
},
{
"epoch": 1.381334671349724,
"grad_norm": 0.47742709258212396,
"learning_rate": 2.640891630688682e-06,
"loss": 0.4856,
"mean_token_accuracy": 0.8515735983848571,
"step": 1720
},
{
"epoch": 1.3853487205218264,
"grad_norm": 0.4519116845237833,
"learning_rate": 2.610049307541085e-06,
"loss": 0.4745,
"mean_token_accuracy": 0.8553285971283913,
"step": 1725
},
{
"epoch": 1.3893627696939288,
"grad_norm": 0.46002460375644066,
"learning_rate": 2.579324404698428e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.8532739356160164,
"step": 1730
},
{
"epoch": 1.3933768188660312,
"grad_norm": 0.43230273487871207,
"learning_rate": 2.548718431701251e-06,
"loss": 0.4624,
"mean_token_accuracy": 0.8579189226031303,
"step": 1735
},
{
"epoch": 1.3973908680381335,
"grad_norm": 0.4784555211080267,
"learning_rate": 2.518232892246972e-06,
"loss": 0.4783,
"mean_token_accuracy": 0.8536991968750953,
"step": 1740
},
{
"epoch": 1.4014049172102359,
"grad_norm": 0.47787198660534125,
"learning_rate": 2.4878692841160053e-06,
"loss": 0.4762,
"mean_token_accuracy": 0.8540623039007187,
"step": 1745
},
{
"epoch": 1.4054189663823382,
"grad_norm": 0.47979677873474513,
"learning_rate": 2.4576290990981755e-06,
"loss": 0.4909,
"mean_token_accuracy": 0.8503037631511688,
"step": 1750
},
{
"epoch": 1.4094330155544406,
"grad_norm": 0.5095819745293857,
"learning_rate": 2.4275138229194238e-06,
"loss": 0.4759,
"mean_token_accuracy": 0.8540142044425011,
"step": 1755
},
{
"epoch": 1.413447064726543,
"grad_norm": 0.4805811343718235,
"learning_rate": 2.3975249351688207e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.8527746021747589,
"step": 1760
},
{
"epoch": 1.4174611138986453,
"grad_norm": 0.5969184626019208,
"learning_rate": 2.3676639092258584e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.852690675854683,
"step": 1765
},
{
"epoch": 1.4214751630707476,
"grad_norm": 0.4818521910414447,
"learning_rate": 2.337932212188073e-06,
"loss": 0.477,
"mean_token_accuracy": 0.8541664496064186,
"step": 1770
},
{
"epoch": 1.42548921224285,
"grad_norm": 0.465679128369887,
"learning_rate": 2.3083313047989626e-06,
"loss": 0.4772,
"mean_token_accuracy": 0.8541705653071403,
"step": 1775
},
{
"epoch": 1.4295032614149523,
"grad_norm": 0.47603113675595066,
"learning_rate": 2.278862641376215e-06,
"loss": 0.4718,
"mean_token_accuracy": 0.8553374916315079,
"step": 1780
},
{
"epoch": 1.4335173105870547,
"grad_norm": 0.4274170132490548,
"learning_rate": 2.2495276697402663e-06,
"loss": 0.4772,
"mean_token_accuracy": 0.8541041478514672,
"step": 1785
},
{
"epoch": 1.437531359759157,
"grad_norm": 0.4666292377645937,
"learning_rate": 2.2203278311431575e-06,
"loss": 0.4779,
"mean_token_accuracy": 0.8537328645586968,
"step": 1790
},
{
"epoch": 1.4415454089312594,
"grad_norm": 0.4537894496554442,
"learning_rate": 2.1912645601977283e-06,
"loss": 0.4686,
"mean_token_accuracy": 0.8557942762970925,
"step": 1795
},
{
"epoch": 1.4455594581033617,
"grad_norm": 0.475653896781378,
"learning_rate": 2.162339284807136e-06,
"loss": 0.4882,
"mean_token_accuracy": 0.8511700749397277,
"step": 1800
},
{
"epoch": 1.449573507275464,
"grad_norm": 0.4920511476816344,
"learning_rate": 2.1335534260946945e-06,
"loss": 0.488,
"mean_token_accuracy": 0.8513952940702438,
"step": 1805
},
{
"epoch": 1.4535875564475664,
"grad_norm": 0.4814427078654733,
"learning_rate": 2.104908398334069e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.8537196487188339,
"step": 1810
},
{
"epoch": 1.4576016056196688,
"grad_norm": 0.48060183639390225,
"learning_rate": 2.0764056088797646e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.8542910426855087,
"step": 1815
},
{
"epoch": 1.4616156547917711,
"grad_norm": 0.4632278881841595,
"learning_rate": 2.048046458098013e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.8533736944198609,
"step": 1820
},
{
"epoch": 1.4656297039638735,
"grad_norm": 0.43866859419295146,
"learning_rate": 2.0198323392979453e-06,
"loss": 0.4734,
"mean_token_accuracy": 0.8550125047564506,
"step": 1825
},
{
"epoch": 1.4696437531359758,
"grad_norm": 0.5025943669083757,
"learning_rate": 1.9917646386631577e-06,
"loss": 0.4796,
"mean_token_accuracy": 0.8533670097589493,
"step": 1830
},
{
"epoch": 1.4736578023080782,
"grad_norm": 0.4544304942999498,
"learning_rate": 1.9638447351835875e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.8517355710268021,
"step": 1835
},
{
"epoch": 1.4776718514801805,
"grad_norm": 0.4499771859558144,
"learning_rate": 1.9360740005877774e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.8536696940660476,
"step": 1840
},
{
"epoch": 1.4816859006522831,
"grad_norm": 0.4391506848423441,
"learning_rate": 1.908453799275479e-06,
"loss": 0.4751,
"mean_token_accuracy": 0.8547928795218468,
"step": 1845
},
{
"epoch": 1.4856999498243852,
"grad_norm": 0.4321233556378181,
"learning_rate": 1.8809854882506129e-06,
"loss": 0.4719,
"mean_token_accuracy": 0.8550894305109977,
"step": 1850
},
{
"epoch": 1.4897139989964878,
"grad_norm": 0.4545767091147806,
"learning_rate": 1.8536704170546005e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.8536446437239646,
"step": 1855
},
{
"epoch": 1.49372804816859,
"grad_norm": 0.45768305317904345,
"learning_rate": 1.8265099277000614e-06,
"loss": 0.4675,
"mean_token_accuracy": 0.8566059991717339,
"step": 1860
},
{
"epoch": 1.4977420973406925,
"grad_norm": 0.583758976338618,
"learning_rate": 1.7995053546048762e-06,
"loss": 0.4861,
"mean_token_accuracy": 0.8516813203692436,
"step": 1865
},
{
"epoch": 1.5017561465127947,
"grad_norm": 0.5014344703500223,
"learning_rate": 1.7726580245266334e-06,
"loss": 0.482,
"mean_token_accuracy": 0.852832806110382,
"step": 1870
},
{
"epoch": 1.5057701956848972,
"grad_norm": 0.6997345848148291,
"learning_rate": 1.7459692564974317e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.852383628487587,
"step": 1875
},
{
"epoch": 1.5097842448569994,
"grad_norm": 0.5154156720297645,
"learning_rate": 1.719440361759086e-06,
"loss": 0.4808,
"mean_token_accuracy": 0.8528477355837822,
"step": 1880
},
{
"epoch": 1.513798294029102,
"grad_norm": 0.4870849894345533,
"learning_rate": 1.6930726436986977e-06,
"loss": 0.4702,
"mean_token_accuracy": 0.8555424034595489,
"step": 1885
},
{
"epoch": 1.517812343201204,
"grad_norm": 0.4518757378910186,
"learning_rate": 1.6668673977846255e-06,
"loss": 0.4766,
"mean_token_accuracy": 0.8546996787190437,
"step": 1890
},
{
"epoch": 1.5218263923733066,
"grad_norm": 0.4576719732501448,
"learning_rate": 1.6408259115028325e-06,
"loss": 0.4676,
"mean_token_accuracy": 0.8562597304582595,
"step": 1895
},
{
"epoch": 1.5258404415454088,
"grad_norm": 0.4836723761371539,
"learning_rate": 1.6149494642936253e-06,
"loss": 0.4775,
"mean_token_accuracy": 0.8537423238158226,
"step": 1900
},
{
"epoch": 1.5298544907175113,
"grad_norm": 0.4386507907416896,
"learning_rate": 1.589239327488812e-06,
"loss": 0.4769,
"mean_token_accuracy": 0.8538577347993851,
"step": 1905
},
{
"epoch": 1.5338685398896137,
"grad_norm": 0.4688066892992193,
"learning_rate": 1.5636967642492196e-06,
"loss": 0.4691,
"mean_token_accuracy": 0.8562686622142792,
"step": 1910
},
{
"epoch": 1.537882589061716,
"grad_norm": 0.434007927437407,
"learning_rate": 1.538323029502654e-06,
"loss": 0.4806,
"mean_token_accuracy": 0.8535393372178077,
"step": 1915
},
{
"epoch": 1.5418966382338184,
"grad_norm": 0.43743732120688245,
"learning_rate": 1.5131193698822234e-06,
"loss": 0.4612,
"mean_token_accuracy": 0.8581413358449936,
"step": 1920
},
{
"epoch": 1.5459106874059207,
"grad_norm": 0.4569484082529831,
"learning_rate": 1.488087023665104e-06,
"loss": 0.4768,
"mean_token_accuracy": 0.8539406448602677,
"step": 1925
},
{
"epoch": 1.549924736578023,
"grad_norm": 0.4699068791196816,
"learning_rate": 1.463227220711706e-06,
"loss": 0.4735,
"mean_token_accuracy": 0.8544613897800446,
"step": 1930
},
{
"epoch": 1.5539387857501255,
"grad_norm": 0.424463650107424,
"learning_rate": 1.4385411824052343e-06,
"loss": 0.4781,
"mean_token_accuracy": 0.8539833888411522,
"step": 1935
},
{
"epoch": 1.5579528349222278,
"grad_norm": 0.46255145201724784,
"learning_rate": 1.414030121591692e-06,
"loss": 0.4729,
"mean_token_accuracy": 0.8552562475204468,
"step": 1940
},
{
"epoch": 1.5619668840943302,
"grad_norm": 0.46101782496953947,
"learning_rate": 1.3896952425202893e-06,
"loss": 0.4763,
"mean_token_accuracy": 0.8538477480411529,
"step": 1945
},
{
"epoch": 1.5659809332664325,
"grad_norm": 0.4710263307623761,
"learning_rate": 1.3655377407842813e-06,
"loss": 0.4785,
"mean_token_accuracy": 0.8540721848607064,
"step": 1950
},
{
"epoch": 1.5699949824385349,
"grad_norm": 0.45711760666640394,
"learning_rate": 1.3415588032622202e-06,
"loss": 0.4717,
"mean_token_accuracy": 0.8554088562726975,
"step": 1955
},
{
"epoch": 1.5740090316106372,
"grad_norm": 0.4456868518853785,
"learning_rate": 1.3177596080596467e-06,
"loss": 0.4701,
"mean_token_accuracy": 0.8558782458305358,
"step": 1960
},
{
"epoch": 1.5780230807827396,
"grad_norm": 0.4290784998523086,
"learning_rate": 1.2941413244512113e-06,
"loss": 0.4732,
"mean_token_accuracy": 0.8551268830895424,
"step": 1965
},
{
"epoch": 1.582037129954842,
"grad_norm": 0.45571698707191083,
"learning_rate": 1.2707051128232217e-06,
"loss": 0.478,
"mean_token_accuracy": 0.8536119505763053,
"step": 1970
},
{
"epoch": 1.5860511791269443,
"grad_norm": 0.4553754461355705,
"learning_rate": 1.2474521246166392e-06,
"loss": 0.4745,
"mean_token_accuracy": 0.8546977415680885,
"step": 1975
},
{
"epoch": 1.5900652282990466,
"grad_norm": 0.4394199878687813,
"learning_rate": 1.2243835022705003e-06,
"loss": 0.4761,
"mean_token_accuracy": 0.8550259307026863,
"step": 1980
},
{
"epoch": 1.594079277471149,
"grad_norm": 0.46568138230485706,
"learning_rate": 1.2015003791657854e-06,
"loss": 0.4672,
"mean_token_accuracy": 0.8566817179322243,
"step": 1985
},
{
"epoch": 1.5980933266432515,
"grad_norm": 0.43193045548782183,
"learning_rate": 1.1788038795697487e-06,
"loss": 0.4737,
"mean_token_accuracy": 0.8545981183648109,
"step": 1990
},
{
"epoch": 1.6021073758153537,
"grad_norm": 0.43777140121635394,
"learning_rate": 1.1562951185806675e-06,
"loss": 0.4758,
"mean_token_accuracy": 0.8548903733491897,
"step": 1995
},
{
"epoch": 1.6061214249874562,
"grad_norm": 0.5060321727440272,
"learning_rate": 1.1339752020730664e-06,
"loss": 0.4734,
"mean_token_accuracy": 0.8551399186253548,
"step": 2000
},
{
"epoch": 1.6101354741595584,
"grad_norm": 0.4923417367245005,
"learning_rate": 1.1118452266433732e-06,
"loss": 0.4777,
"mean_token_accuracy": 0.8535337939858436,
"step": 2005
},
{
"epoch": 1.614149523331661,
"grad_norm": 0.47803638007953236,
"learning_rate": 1.0899062795560572e-06,
"loss": 0.4848,
"mean_token_accuracy": 0.8526277154684067,
"step": 2010
},
{
"epoch": 1.618163572503763,
"grad_norm": 0.4643935744468831,
"learning_rate": 1.068159438690199e-06,
"loss": 0.4799,
"mean_token_accuracy": 0.8536258295178414,
"step": 2015
},
{
"epoch": 1.6221776216758657,
"grad_norm": 0.4467924882668494,
"learning_rate": 1.046605772486538e-06,
"loss": 0.4808,
"mean_token_accuracy": 0.8534541547298431,
"step": 2020
},
{
"epoch": 1.6261916708479678,
"grad_norm": 0.4842025238599929,
"learning_rate": 1.025246339894979e-06,
"loss": 0.4752,
"mean_token_accuracy": 0.8539585873484612,
"step": 2025
},
{
"epoch": 1.6302057200200704,
"grad_norm": 0.4867487448285858,
"learning_rate": 1.0040821903225633e-06,
"loss": 0.4701,
"mean_token_accuracy": 0.8561724841594696,
"step": 2030
},
{
"epoch": 1.6342197691921725,
"grad_norm": 0.4284828609172432,
"learning_rate": 9.831143635819162e-07,
"loss": 0.4766,
"mean_token_accuracy": 0.8540926545858383,
"step": 2035
},
{
"epoch": 1.638233818364275,
"grad_norm": 0.47071814520488964,
"learning_rate": 9.62343889840151e-07,
"loss": 0.4771,
"mean_token_accuracy": 0.8538994386792182,
"step": 2040
},
{
"epoch": 1.6422478675363772,
"grad_norm": 0.46031800716554394,
"learning_rate": 9.417717895682626e-07,
"loss": 0.4715,
"mean_token_accuracy": 0.8556769326329231,
"step": 2045
},
{
"epoch": 1.6462619167084798,
"grad_norm": 0.4823659021884061,
"learning_rate": 9.213990734909884e-07,
"loss": 0.4813,
"mean_token_accuracy": 0.8533883213996887,
"step": 2050
},
{
"epoch": 1.650275965880582,
"grad_norm": 0.4373578109896797,
"learning_rate": 9.012267425371513e-07,
"loss": 0.4809,
"mean_token_accuracy": 0.8532153591513634,
"step": 2055
},
{
"epoch": 1.6542900150526845,
"grad_norm": 0.4576132273763026,
"learning_rate": 8.812557877904848e-07,
"loss": 0.4754,
"mean_token_accuracy": 0.8547022685408592,
"step": 2060
},
{
"epoch": 1.6583040642247866,
"grad_norm": 0.43234177821278336,
"learning_rate": 8.614871904409372e-07,
"loss": 0.4623,
"mean_token_accuracy": 0.8578175500035286,
"step": 2065
},
{
"epoch": 1.6623181133968892,
"grad_norm": 0.46525027057902296,
"learning_rate": 8.419219217364654e-07,
"loss": 0.4757,
"mean_token_accuracy": 0.8548225834965706,
"step": 2070
},
{
"epoch": 1.6663321625689913,
"grad_norm": 0.4499223883161168,
"learning_rate": 8.225609429353187e-07,
"loss": 0.4753,
"mean_token_accuracy": 0.8543162420392036,
"step": 2075
},
{
"epoch": 1.6703462117410939,
"grad_norm": 0.44239112700740263,
"learning_rate": 8.034052052588076e-07,
"loss": 0.4764,
"mean_token_accuracy": 0.8538721084594727,
"step": 2080
},
{
"epoch": 1.6743602609131962,
"grad_norm": 0.4326153640002424,
"learning_rate": 7.844556498445788e-07,
"loss": 0.4774,
"mean_token_accuracy": 0.8538077279925347,
"step": 2085
},
{
"epoch": 1.6783743100852986,
"grad_norm": 0.4601280688859696,
"learning_rate": 7.657132077003599e-07,
"loss": 0.4705,
"mean_token_accuracy": 0.8558229163289071,
"step": 2090
},
{
"epoch": 1.682388359257401,
"grad_norm": 0.44335461441524826,
"learning_rate": 7.471787996582358e-07,
"loss": 0.4709,
"mean_token_accuracy": 0.8559122830629349,
"step": 2095
},
{
"epoch": 1.6864024084295033,
"grad_norm": 0.46533113083334554,
"learning_rate": 7.288533363293959e-07,
"loss": 0.4717,
"mean_token_accuracy": 0.855779829621315,
"step": 2100
},
{
"epoch": 1.6904164576016056,
"grad_norm": 0.4345554569585679,
"learning_rate": 7.107377180593994e-07,
"loss": 0.4761,
"mean_token_accuracy": 0.8539123505353927,
"step": 2105
},
{
"epoch": 1.694430506773708,
"grad_norm": 0.4537776341024369,
"learning_rate": 6.928328348839392e-07,
"loss": 0.473,
"mean_token_accuracy": 0.8550264790654183,
"step": 2110
},
{
"epoch": 1.6984445559458103,
"grad_norm": 0.4608039104937001,
"learning_rate": 6.751395664851135e-07,
"loss": 0.4772,
"mean_token_accuracy": 0.8542622447013855,
"step": 2115
},
{
"epoch": 1.7024586051179127,
"grad_norm": 0.45531945033095894,
"learning_rate": 6.5765878214821e-07,
"loss": 0.477,
"mean_token_accuracy": 0.8539689004421234,
"step": 2120
},
{
"epoch": 1.706472654290015,
"grad_norm": 0.4488368557626286,
"learning_rate": 6.403913407189921e-07,
"loss": 0.4706,
"mean_token_accuracy": 0.8554460853338242,
"step": 2125
},
{
"epoch": 1.7104867034621174,
"grad_norm": 0.4511207312852134,
"learning_rate": 6.233380905615049e-07,
"loss": 0.4737,
"mean_token_accuracy": 0.8550473853945733,
"step": 2130
},
{
"epoch": 1.7145007526342197,
"grad_norm": 0.4831341748488721,
"learning_rate": 6.064998695163948e-07,
"loss": 0.4733,
"mean_token_accuracy": 0.8551791489124299,
"step": 2135
},
{
"epoch": 1.718514801806322,
"grad_norm": 0.4492097445567558,
"learning_rate": 5.898775048597449e-07,
"loss": 0.4778,
"mean_token_accuracy": 0.8540239587426186,
"step": 2140
},
{
"epoch": 1.7225288509784245,
"grad_norm": 0.43196711826065015,
"learning_rate": 5.734718132624351e-07,
"loss": 0.4701,
"mean_token_accuracy": 0.8554452732205391,
"step": 2145
},
{
"epoch": 1.7265429001505268,
"grad_norm": 0.4481914858588619,
"learning_rate": 5.57283600750006e-07,
"loss": 0.4749,
"mean_token_accuracy": 0.854470057785511,
"step": 2150
},
{
"epoch": 1.7305569493226292,
"grad_norm": 0.442688331520453,
"learning_rate": 5.41313662663075e-07,
"loss": 0.4813,
"mean_token_accuracy": 0.8537053450942039,
"step": 2155
},
{
"epoch": 1.7345709984947315,
"grad_norm": 0.45437110406179004,
"learning_rate": 5.255627836182453e-07,
"loss": 0.4742,
"mean_token_accuracy": 0.8547719925642013,
"step": 2160
},
{
"epoch": 1.738585047666834,
"grad_norm": 0.4525783315312203,
"learning_rate": 5.100317374695673e-07,
"loss": 0.4712,
"mean_token_accuracy": 0.854958064854145,
"step": 2165
},
{
"epoch": 1.7425990968389362,
"grad_norm": 0.45989644907653204,
"learning_rate": 4.947212872705131e-07,
"loss": 0.4844,
"mean_token_accuracy": 0.8522865787148476,
"step": 2170
},
{
"epoch": 1.7466131460110388,
"grad_norm": 0.44179078654846465,
"learning_rate": 4.796321852364877e-07,
"loss": 0.4726,
"mean_token_accuracy": 0.8552617311477662,
"step": 2175
},
{
"epoch": 1.750627195183141,
"grad_norm": 0.44502137847599055,
"learning_rate": 4.6476517270787667e-07,
"loss": 0.4755,
"mean_token_accuracy": 0.8547576576471329,
"step": 2180
},
{
"epoch": 1.7546412443552435,
"grad_norm": 0.4293509834801368,
"learning_rate": 4.5012098011361583e-07,
"loss": 0.475,
"mean_token_accuracy": 0.8545479908585548,
"step": 2185
},
{
"epoch": 1.7586552935273456,
"grad_norm": 0.4476896792512822,
"learning_rate": 4.357003269353105e-07,
"loss": 0.4752,
"mean_token_accuracy": 0.854350033402443,
"step": 2190
},
{
"epoch": 1.7626693426994482,
"grad_norm": 0.43152267828647745,
"learning_rate": 4.215039216718847e-07,
"loss": 0.4794,
"mean_token_accuracy": 0.8537008062005043,
"step": 2195
},
{
"epoch": 1.7666833918715503,
"grad_norm": 0.47430915815921837,
"learning_rate": 4.075324618047705e-07,
"loss": 0.4758,
"mean_token_accuracy": 0.8541364178061486,
"step": 2200
},
{
"epoch": 1.770697441043653,
"grad_norm": 0.4287695621483679,
"learning_rate": 3.937866337636459e-07,
"loss": 0.4656,
"mean_token_accuracy": 0.8566897332668304,
"step": 2205
},
{
"epoch": 1.774711490215755,
"grad_norm": 0.43111942298665074,
"learning_rate": 3.802671128927016e-07,
"loss": 0.476,
"mean_token_accuracy": 0.854565116763115,
"step": 2210
},
{
"epoch": 1.7787255393878576,
"grad_norm": 0.436167995973845,
"learning_rate": 3.6697456341746706e-07,
"loss": 0.4729,
"mean_token_accuracy": 0.8552428483963013,
"step": 2215
},
{
"epoch": 1.7827395885599597,
"grad_norm": 0.4415001621236335,
"learning_rate": 3.539096384121743e-07,
"loss": 0.4673,
"mean_token_accuracy": 0.8565417811274528,
"step": 2220
},
{
"epoch": 1.7867536377320623,
"grad_norm": 0.4415379193192118,
"learning_rate": 3.4107297976767097e-07,
"loss": 0.4696,
"mean_token_accuracy": 0.855958080291748,
"step": 2225
},
{
"epoch": 1.7907676869041644,
"grad_norm": 0.4451028631056662,
"learning_rate": 3.2846521815988853e-07,
"loss": 0.4806,
"mean_token_accuracy": 0.8532564789056778,
"step": 2230
},
{
"epoch": 1.794781736076267,
"grad_norm": 0.42848978049456043,
"learning_rate": 3.160869730188465e-07,
"loss": 0.4753,
"mean_token_accuracy": 0.8544053509831429,
"step": 2235
},
{
"epoch": 1.7987957852483691,
"grad_norm": 0.4358873015501897,
"learning_rate": 3.0393885249823174e-07,
"loss": 0.4708,
"mean_token_accuracy": 0.8561656758189201,
"step": 2240
},
{
"epoch": 1.8028098344204717,
"grad_norm": 0.4235541886416018,
"learning_rate": 2.92021453445509e-07,
"loss": 0.4813,
"mean_token_accuracy": 0.8529417350888252,
"step": 2245
},
{
"epoch": 1.8068238835925738,
"grad_norm": 0.4835046099418169,
"learning_rate": 2.8033536137260565e-07,
"loss": 0.4783,
"mean_token_accuracy": 0.8537642017006875,
"step": 2250
},
{
"epoch": 1.8108379327646764,
"grad_norm": 0.42897346841833367,
"learning_rate": 2.688811504271371e-07,
"loss": 0.4729,
"mean_token_accuracy": 0.8553988888859749,
"step": 2255
},
{
"epoch": 1.8148519819367788,
"grad_norm": 0.4301559523428082,
"learning_rate": 2.576593833642033e-07,
"loss": 0.4761,
"mean_token_accuracy": 0.8536762282252311,
"step": 2260
},
{
"epoch": 1.8188660311088811,
"grad_norm": 0.4369762786857926,
"learning_rate": 2.466706115187406e-07,
"loss": 0.4755,
"mean_token_accuracy": 0.854819355905056,
"step": 2265
},
{
"epoch": 1.8228800802809835,
"grad_norm": 0.44612227390075465,
"learning_rate": 2.3591537477843208e-07,
"loss": 0.4755,
"mean_token_accuracy": 0.854425060749054,
"step": 2270
},
{
"epoch": 1.8268941294530858,
"grad_norm": 0.44247042993963465,
"learning_rate": 2.253942015571814e-07,
"loss": 0.474,
"mean_token_accuracy": 0.8552328020334243,
"step": 2275
},
{
"epoch": 1.8309081786251882,
"grad_norm": 0.4578633638360693,
"learning_rate": 2.1510760876915505e-07,
"loss": 0.4774,
"mean_token_accuracy": 0.8542596101760864,
"step": 2280
},
{
"epoch": 1.8349222277972905,
"grad_norm": 0.43669293127724457,
"learning_rate": 2.0505610180338198e-07,
"loss": 0.4717,
"mean_token_accuracy": 0.855582419037819,
"step": 2285
},
{
"epoch": 1.8389362769693929,
"grad_norm": 0.4239247527956778,
"learning_rate": 1.952401744989274e-07,
"loss": 0.4654,
"mean_token_accuracy": 0.85718754529953,
"step": 2290
},
{
"epoch": 1.8429503261414952,
"grad_norm": 0.46610653318906003,
"learning_rate": 1.856603091206255e-07,
"loss": 0.4788,
"mean_token_accuracy": 0.8535729303956032,
"step": 2295
},
{
"epoch": 1.8469643753135976,
"grad_norm": 0.42340902296366656,
"learning_rate": 1.7631697633539058e-07,
"loss": 0.4685,
"mean_token_accuracy": 0.8563544407486916,
"step": 2300
},
{
"epoch": 1.8509784244857,
"grad_norm": 0.4524402197663993,
"learning_rate": 1.672106351890862e-07,
"loss": 0.4762,
"mean_token_accuracy": 0.8543719783425331,
"step": 2305
},
{
"epoch": 1.8549924736578023,
"grad_norm": 0.44808822341167676,
"learning_rate": 1.583417330839798e-07,
"loss": 0.4779,
"mean_token_accuracy": 0.8537597358226776,
"step": 2310
},
{
"epoch": 1.8590065228299046,
"grad_norm": 0.4274978896560569,
"learning_rate": 1.4971070575675538e-07,
"loss": 0.4825,
"mean_token_accuracy": 0.8530182659626007,
"step": 2315
},
{
"epoch": 1.863020572002007,
"grad_norm": 0.41753329575556447,
"learning_rate": 1.413179772571055e-07,
"loss": 0.4769,
"mean_token_accuracy": 0.8544663786888123,
"step": 2320
},
{
"epoch": 1.8670346211741093,
"grad_norm": 0.46270689148121547,
"learning_rate": 1.3316395992690302e-07,
"loss": 0.4831,
"mean_token_accuracy": 0.8527223974466324,
"step": 2325
},
{
"epoch": 1.8710486703462117,
"grad_norm": 0.4699440162702181,
"learning_rate": 1.252490543799345e-07,
"loss": 0.4664,
"mean_token_accuracy": 0.8564791366457939,
"step": 2330
},
{
"epoch": 1.875062719518314,
"grad_norm": 0.4222369586797418,
"learning_rate": 1.175736494822266e-07,
"loss": 0.4704,
"mean_token_accuracy": 0.855928809940815,
"step": 2335
},
{
"epoch": 1.8790767686904166,
"grad_norm": 0.4301011412110541,
"learning_rate": 1.1013812233293008e-07,
"loss": 0.4666,
"mean_token_accuracy": 0.8567720711231231,
"step": 2340
},
{
"epoch": 1.8830908178625188,
"grad_norm": 0.4326138437671182,
"learning_rate": 1.0294283824580309e-07,
"loss": 0.4832,
"mean_token_accuracy": 0.8523061379790307,
"step": 2345
},
{
"epoch": 1.8871048670346213,
"grad_norm": 0.4497012617910045,
"learning_rate": 9.5988150731256e-08,
"loss": 0.4745,
"mean_token_accuracy": 0.8544217735528946,
"step": 2350
},
{
"epoch": 1.8911189162067235,
"grad_norm": 0.44164898887491794,
"learning_rate": 8.927440147898703e-08,
"loss": 0.4631,
"mean_token_accuracy": 0.8575935378670693,
"step": 2355
},
{
"epoch": 1.895132965378826,
"grad_norm": 0.4230848433661066,
"learning_rate": 8.280192034119116e-08,
"loss": 0.4736,
"mean_token_accuracy": 0.8550165235996247,
"step": 2360
},
{
"epoch": 1.8991470145509282,
"grad_norm": 0.4257386932133046,
"learning_rate": 7.657102531635762e-08,
"loss": 0.4757,
"mean_token_accuracy": 0.8543663218617439,
"step": 2365
},
{
"epoch": 1.9031610637230307,
"grad_norm": 0.4407975270333358,
"learning_rate": 7.058202253364511e-08,
"loss": 0.4794,
"mean_token_accuracy": 0.8536389827728271,
"step": 2370
},
{
"epoch": 1.9071751128951329,
"grad_norm": 0.42934669175844953,
"learning_rate": 6.4835206237841e-08,
"loss": 0.4742,
"mean_token_accuracy": 0.85507842451334,
"step": 2375
},
{
"epoch": 1.9111891620672354,
"grad_norm": 0.43251041025298353,
"learning_rate": 5.933085877490474e-08,
"loss": 0.4744,
"mean_token_accuracy": 0.8547703847289085,
"step": 2380
},
{
"epoch": 1.9152032112393376,
"grad_norm": 0.43605508973639817,
"learning_rate": 5.406925057809653e-08,
"loss": 0.4748,
"mean_token_accuracy": 0.8543973922729492,
"step": 2385
},
{
"epoch": 1.9192172604114401,
"grad_norm": 0.43786161881380403,
"learning_rate": 4.9050640154690297e-08,
"loss": 0.4705,
"mean_token_accuracy": 0.8555782288312912,
"step": 2390
},
{
"epoch": 1.9232313095835423,
"grad_norm": 0.4488612147870018,
"learning_rate": 4.427527407327381e-08,
"loss": 0.4825,
"mean_token_accuracy": 0.852660457789898,
"step": 2395
},
{
"epoch": 1.9272453587556448,
"grad_norm": 0.41818464639030534,
"learning_rate": 3.974338695163393e-08,
"loss": 0.4826,
"mean_token_accuracy": 0.8533532917499542,
"step": 2400
},
{
"epoch": 1.931259407927747,
"grad_norm": 0.43318994140619926,
"learning_rate": 3.5455201445228625e-08,
"loss": 0.4721,
"mean_token_accuracy": 0.8555899515748024,
"step": 2405
},
{
"epoch": 1.9352734570998495,
"grad_norm": 0.4350348341914019,
"learning_rate": 3.141092823625014e-08,
"loss": 0.4753,
"mean_token_accuracy": 0.8547878980636596,
"step": 2410
},
{
"epoch": 1.9392875062719517,
"grad_norm": 0.437156964872597,
"learning_rate": 2.7610766023271618e-08,
"loss": 0.4642,
"mean_token_accuracy": 0.8575856953859329,
"step": 2415
},
{
"epoch": 1.9433015554440543,
"grad_norm": 0.4439443740172379,
"learning_rate": 2.405490151148715e-08,
"loss": 0.4816,
"mean_token_accuracy": 0.852755481004715,
"step": 2420
},
{
"epoch": 1.9473156046161564,
"grad_norm": 0.4243080967093221,
"learning_rate": 2.07435094035352e-08,
"loss": 0.4713,
"mean_token_accuracy": 0.8551632657647132,
"step": 2425
},
{
"epoch": 1.951329653788259,
"grad_norm": 0.4417367451533877,
"learning_rate": 1.7676752390920482e-08,
"loss": 0.4693,
"mean_token_accuracy": 0.8561436429619789,
"step": 2430
},
{
"epoch": 1.9553437029603613,
"grad_norm": 0.5506523758872923,
"learning_rate": 1.4854781146015906e-08,
"loss": 0.472,
"mean_token_accuracy": 0.8551851272583008,
"step": 2435
},
{
"epoch": 1.9593577521324637,
"grad_norm": 0.4172594879961426,
"learning_rate": 1.2277734314662948e-08,
"loss": 0.4648,
"mean_token_accuracy": 0.8577016338706016,
"step": 2440
},
{
"epoch": 1.963371801304566,
"grad_norm": 0.43928189202839146,
"learning_rate": 9.945738509358205e-09,
"loss": 0.4751,
"mean_token_accuracy": 0.8543354839086532,
"step": 2445
},
{
"epoch": 1.9673858504766684,
"grad_norm": 0.46064111484894527,
"learning_rate": 7.85890830303393e-09,
"loss": 0.4799,
"mean_token_accuracy": 0.8528526350855827,
"step": 2450
},
{
"epoch": 1.9713998996487707,
"grad_norm": 0.4273528805773524,
"learning_rate": 6.017346223429199e-09,
"loss": 0.473,
"mean_token_accuracy": 0.855083005130291,
"step": 2455
},
{
"epoch": 1.975413948820873,
"grad_norm": 0.41903611556825693,
"learning_rate": 4.421142748050056e-09,
"loss": 0.4722,
"mean_token_accuracy": 0.8553782507777214,
"step": 2460
},
{
"epoch": 1.9794279979929754,
"grad_norm": 0.40693132138961724,
"learning_rate": 3.070376299728062e-09,
"loss": 0.4768,
"mean_token_accuracy": 0.8542209342122078,
"step": 2465
},
{
"epoch": 1.9834420471650778,
"grad_norm": 0.45323616894670904,
"learning_rate": 1.965113242764494e-09,
"loss": 0.4797,
"mean_token_accuracy": 0.8531859144568443,
"step": 2470
},
{
"epoch": 1.9874560963371801,
"grad_norm": 0.4334834647690343,
"learning_rate": 1.105407879670728e-09,
"loss": 0.4766,
"mean_token_accuracy": 0.854351207613945,
"step": 2475
},
{
"epoch": 1.9914701455092825,
"grad_norm": 0.4426951792767383,
"learning_rate": 4.913024485003748e-10,
"loss": 0.4837,
"mean_token_accuracy": 0.852310574054718,
"step": 2480
},
{
"epoch": 1.9954841946813848,
"grad_norm": 0.4662710035176809,
"learning_rate": 1.2282712077538173e-10,
"loss": 0.476,
"mean_token_accuracy": 0.8544206082820892,
"step": 2485
},
{
"epoch": 1.9994982438534872,
"grad_norm": 0.44864109609424646,
"learning_rate": 0.0,
"loss": 0.4713,
"mean_token_accuracy": 0.8560152605175972,
"step": 2490
},
{
"epoch": 1.9994982438534872,
"step": 2490,
"total_flos": 1305393569464320.0,
"train_loss": 0.5340647190928938,
"train_runtime": 372903.5049,
"train_samples_per_second": 1.71,
"train_steps_per_second": 0.007
}
],
"logging_steps": 5,
"max_steps": 2490,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1305393569464320.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}