mbert_ron-latn / trainer_state.json
DGurgurov's picture
Uploading checkpoint-98000 for mbert - ron-latn
98afa07 verified
{
"best_metric": 0.7897204756736755,
"best_model_checkpoint": "./model_fine-tune/glot/mbert/ron-Latn/checkpoint-98000",
"epoch": 13.435700575815739,
"eval_steps": 500,
"global_step": 98000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06854949273375377,
"grad_norm": 4.000861644744873,
"learning_rate": 9.95e-05,
"loss": 1.7568,
"step": 500
},
{
"epoch": 0.06854949273375377,
"eval_accuracy": 0.7095725691837217,
"eval_loss": 1.6891406774520874,
"eval_runtime": 259.6177,
"eval_samples_per_second": 156.391,
"eval_steps_per_second": 4.888,
"step": 500
},
{
"epoch": 0.13709898546750754,
"grad_norm": 4.238007545471191,
"learning_rate": 9.900000000000001e-05,
"loss": 1.5655,
"step": 1000
},
{
"epoch": 0.13709898546750754,
"eval_accuracy": 0.7265132414723929,
"eval_loss": 1.5632870197296143,
"eval_runtime": 260.4774,
"eval_samples_per_second": 155.875,
"eval_steps_per_second": 4.872,
"step": 1000
},
{
"epoch": 0.2056484782012613,
"grad_norm": 3.5532774925231934,
"learning_rate": 9.850000000000001e-05,
"loss": 1.4944,
"step": 1500
},
{
"epoch": 0.2056484782012613,
"eval_accuracy": 0.7360105726108715,
"eval_loss": 1.4989327192306519,
"eval_runtime": 260.0937,
"eval_samples_per_second": 156.105,
"eval_steps_per_second": 4.879,
"step": 1500
},
{
"epoch": 0.2741979709350151,
"grad_norm": 3.5770881175994873,
"learning_rate": 9.8e-05,
"loss": 1.4324,
"step": 2000
},
{
"epoch": 0.2741979709350151,
"eval_accuracy": 0.743094448707599,
"eval_loss": 1.4517085552215576,
"eval_runtime": 260.1265,
"eval_samples_per_second": 156.086,
"eval_steps_per_second": 4.878,
"step": 2000
},
{
"epoch": 0.34274746366876885,
"grad_norm": 3.5476059913635254,
"learning_rate": 9.75e-05,
"loss": 1.3797,
"step": 2500
},
{
"epoch": 0.34274746366876885,
"eval_accuracy": 0.7503641171887161,
"eval_loss": 1.4144247770309448,
"eval_runtime": 258.9987,
"eval_samples_per_second": 156.765,
"eval_steps_per_second": 4.9,
"step": 2500
},
{
"epoch": 0.4112969564025226,
"grad_norm": 3.64892578125,
"learning_rate": 9.7e-05,
"loss": 1.3435,
"step": 3000
},
{
"epoch": 0.4112969564025226,
"eval_accuracy": 0.7549876281769841,
"eval_loss": 1.380112648010254,
"eval_runtime": 258.9885,
"eval_samples_per_second": 156.771,
"eval_steps_per_second": 4.9,
"step": 3000
},
{
"epoch": 0.4798464491362764,
"grad_norm": 3.1375253200531006,
"learning_rate": 9.65e-05,
"loss": 1.3133,
"step": 3500
},
{
"epoch": 0.4798464491362764,
"eval_accuracy": 0.7577493236618825,
"eval_loss": 1.374040126800537,
"eval_runtime": 259.0671,
"eval_samples_per_second": 156.724,
"eval_steps_per_second": 4.898,
"step": 3500
},
{
"epoch": 0.5483959418700302,
"grad_norm": 3.1487910747528076,
"learning_rate": 9.6e-05,
"loss": 1.2891,
"step": 4000
},
{
"epoch": 0.5483959418700302,
"eval_accuracy": 0.76247168256945,
"eval_loss": 1.34718918800354,
"eval_runtime": 259.0025,
"eval_samples_per_second": 156.763,
"eval_steps_per_second": 4.9,
"step": 4000
},
{
"epoch": 0.6169454346037839,
"grad_norm": 2.947143316268921,
"learning_rate": 9.55e-05,
"loss": 1.2764,
"step": 4500
},
{
"epoch": 0.6169454346037839,
"eval_accuracy": 0.7652144022180228,
"eval_loss": 1.3174166679382324,
"eval_runtime": 258.6684,
"eval_samples_per_second": 156.965,
"eval_steps_per_second": 4.906,
"step": 4500
},
{
"epoch": 0.6854949273375377,
"grad_norm": 3.274010181427002,
"learning_rate": 9.5e-05,
"loss": 1.2492,
"step": 5000
},
{
"epoch": 0.6854949273375377,
"eval_accuracy": 0.7671819928468485,
"eval_loss": 1.313983678817749,
"eval_runtime": 258.6625,
"eval_samples_per_second": 156.969,
"eval_steps_per_second": 4.906,
"step": 5000
},
{
"epoch": 0.7540444200712915,
"grad_norm": 3.109161138534546,
"learning_rate": 9.449999999999999e-05,
"loss": 1.2329,
"step": 5500
},
{
"epoch": 0.7540444200712915,
"eval_accuracy": 0.7713562268262496,
"eval_loss": 1.2868432998657227,
"eval_runtime": 260.0165,
"eval_samples_per_second": 156.152,
"eval_steps_per_second": 4.88,
"step": 5500
},
{
"epoch": 0.8225939128050452,
"grad_norm": 7.044505596160889,
"learning_rate": 9.4e-05,
"loss": 1.2232,
"step": 6000
},
{
"epoch": 0.8225939128050452,
"eval_accuracy": 0.7731761649539602,
"eval_loss": 1.2668291330337524,
"eval_runtime": 258.9434,
"eval_samples_per_second": 156.799,
"eval_steps_per_second": 4.901,
"step": 6000
},
{
"epoch": 0.891143405538799,
"grad_norm": 3.029754400253296,
"learning_rate": 9.350000000000001e-05,
"loss": 1.2117,
"step": 6500
},
{
"epoch": 0.891143405538799,
"eval_accuracy": 0.7750832812825318,
"eval_loss": 1.2626760005950928,
"eval_runtime": 260.4421,
"eval_samples_per_second": 155.896,
"eval_steps_per_second": 4.872,
"step": 6500
},
{
"epoch": 0.9596928982725528,
"grad_norm": 2.961531639099121,
"learning_rate": 9.300000000000001e-05,
"loss": 1.1924,
"step": 7000
},
{
"epoch": 0.9596928982725528,
"eval_accuracy": 0.7771516510947648,
"eval_loss": 1.2538079023361206,
"eval_runtime": 260.0863,
"eval_samples_per_second": 156.11,
"eval_steps_per_second": 4.879,
"step": 7000
},
{
"epoch": 1.0282423910063065,
"grad_norm": 3.120314121246338,
"learning_rate": 9.250000000000001e-05,
"loss": 1.1807,
"step": 7500
},
{
"epoch": 1.0282423910063065,
"eval_accuracy": 0.7786684520598107,
"eval_loss": 1.247827410697937,
"eval_runtime": 258.2089,
"eval_samples_per_second": 157.245,
"eval_steps_per_second": 4.915,
"step": 7500
},
{
"epoch": 1.0967918837400603,
"grad_norm": 3.0152571201324463,
"learning_rate": 9.200000000000001e-05,
"loss": 1.1666,
"step": 8000
},
{
"epoch": 1.0967918837400603,
"eval_accuracy": 0.7806228052923374,
"eval_loss": 1.2236727476119995,
"eval_runtime": 258.1981,
"eval_samples_per_second": 157.251,
"eval_steps_per_second": 4.915,
"step": 8000
},
{
"epoch": 1.165341376473814,
"grad_norm": 4.560582637786865,
"learning_rate": 9.15e-05,
"loss": 1.1582,
"step": 8500
},
{
"epoch": 1.165341376473814,
"eval_accuracy": 0.7811331658792164,
"eval_loss": 1.2169686555862427,
"eval_runtime": 257.413,
"eval_samples_per_second": 157.731,
"eval_steps_per_second": 4.93,
"step": 8500
},
{
"epoch": 1.2338908692075679,
"grad_norm": 2.940659523010254,
"learning_rate": 9.1e-05,
"loss": 1.1376,
"step": 9000
},
{
"epoch": 1.2338908692075679,
"eval_accuracy": 0.7844609998371601,
"eval_loss": 1.1992230415344238,
"eval_runtime": 257.3928,
"eval_samples_per_second": 157.743,
"eval_steps_per_second": 4.93,
"step": 9000
},
{
"epoch": 1.3024403619413216,
"grad_norm": 3.4134812355041504,
"learning_rate": 9.05e-05,
"loss": 1.1358,
"step": 9500
},
{
"epoch": 1.3024403619413216,
"eval_accuracy": 0.7851634886822189,
"eval_loss": 1.2106844186782837,
"eval_runtime": 257.3145,
"eval_samples_per_second": 157.791,
"eval_steps_per_second": 4.932,
"step": 9500
},
{
"epoch": 1.3709898546750754,
"grad_norm": 2.8850438594818115,
"learning_rate": 9e-05,
"loss": 1.1281,
"step": 10000
},
{
"epoch": 1.3709898546750754,
"eval_accuracy": 0.7872160784699163,
"eval_loss": 1.2045215368270874,
"eval_runtime": 257.4649,
"eval_samples_per_second": 157.699,
"eval_steps_per_second": 4.929,
"step": 10000
},
{
"epoch": 1.4395393474088292,
"grad_norm": 2.752389669418335,
"learning_rate": 8.950000000000001e-05,
"loss": 1.119,
"step": 10500
},
{
"epoch": 1.4395393474088292,
"eval_accuracy": 0.7882877785798362,
"eval_loss": 1.1866884231567383,
"eval_runtime": 257.0898,
"eval_samples_per_second": 157.929,
"eval_steps_per_second": 4.936,
"step": 10500
},
{
"epoch": 1.508088840142583,
"grad_norm": 3.4821131229400635,
"learning_rate": 8.900000000000001e-05,
"loss": 1.1052,
"step": 11000
},
{
"epoch": 1.508088840142583,
"eval_accuracy": 0.7891213465142315,
"eval_loss": 1.177182912826538,
"eval_runtime": 257.3303,
"eval_samples_per_second": 157.782,
"eval_steps_per_second": 4.931,
"step": 11000
},
{
"epoch": 1.5766383328763367,
"grad_norm": 2.641080379486084,
"learning_rate": 8.850000000000001e-05,
"loss": 1.0969,
"step": 11500
},
{
"epoch": 1.5766383328763367,
"eval_accuracy": 0.790978984380183,
"eval_loss": 1.1677839756011963,
"eval_runtime": 257.8775,
"eval_samples_per_second": 157.447,
"eval_steps_per_second": 4.921,
"step": 11500
},
{
"epoch": 1.6451878256100905,
"grad_norm": 2.670703172683716,
"learning_rate": 8.800000000000001e-05,
"loss": 1.0973,
"step": 12000
},
{
"epoch": 1.6451878256100905,
"eval_accuracy": 0.7914186279008998,
"eval_loss": 1.1657401323318481,
"eval_runtime": 257.6044,
"eval_samples_per_second": 157.614,
"eval_steps_per_second": 4.926,
"step": 12000
},
{
"epoch": 1.7137373183438442,
"grad_norm": 4.184514045715332,
"learning_rate": 8.75e-05,
"loss": 1.0931,
"step": 12500
},
{
"epoch": 1.7137373183438442,
"eval_accuracy": 0.7924260144852202,
"eval_loss": 1.1656056642532349,
"eval_runtime": 257.7225,
"eval_samples_per_second": 157.542,
"eval_steps_per_second": 4.924,
"step": 12500
},
{
"epoch": 1.782286811077598,
"grad_norm": 3.4420437812805176,
"learning_rate": 8.7e-05,
"loss": 1.0882,
"step": 13000
},
{
"epoch": 1.782286811077598,
"eval_accuracy": 0.7937018737958299,
"eval_loss": 1.159055471420288,
"eval_runtime": 257.6253,
"eval_samples_per_second": 157.601,
"eval_steps_per_second": 4.926,
"step": 13000
},
{
"epoch": 1.8508363038113518,
"grad_norm": 2.942854642868042,
"learning_rate": 8.65e-05,
"loss": 1.0802,
"step": 13500
},
{
"epoch": 1.8508363038113518,
"eval_accuracy": 0.7948528675102821,
"eval_loss": 1.1472080945968628,
"eval_runtime": 257.8341,
"eval_samples_per_second": 157.473,
"eval_steps_per_second": 4.922,
"step": 13500
},
{
"epoch": 1.9193857965451055,
"grad_norm": 2.382511854171753,
"learning_rate": 8.6e-05,
"loss": 1.0766,
"step": 14000
},
{
"epoch": 1.9193857965451055,
"eval_accuracy": 0.7966625616136973,
"eval_loss": 1.1392544507980347,
"eval_runtime": 258.8105,
"eval_samples_per_second": 156.879,
"eval_steps_per_second": 4.903,
"step": 14000
},
{
"epoch": 1.9879352892788593,
"grad_norm": 2.583773374557495,
"learning_rate": 8.55e-05,
"loss": 1.0719,
"step": 14500
},
{
"epoch": 1.9879352892788593,
"eval_accuracy": 0.7972140563559621,
"eval_loss": 1.1423135995864868,
"eval_runtime": 257.6602,
"eval_samples_per_second": 157.58,
"eval_steps_per_second": 4.925,
"step": 14500
},
{
"epoch": 2.056484782012613,
"grad_norm": 2.788512945175171,
"learning_rate": 8.5e-05,
"loss": 1.0553,
"step": 15000
},
{
"epoch": 2.056484782012613,
"eval_accuracy": 0.7980805136441053,
"eval_loss": 1.128947377204895,
"eval_runtime": 257.7272,
"eval_samples_per_second": 157.539,
"eval_steps_per_second": 4.924,
"step": 15000
},
{
"epoch": 2.125034274746367,
"grad_norm": 2.9311256408691406,
"learning_rate": 8.450000000000001e-05,
"loss": 1.0451,
"step": 15500
},
{
"epoch": 2.125034274746367,
"eval_accuracy": 0.7988239883829299,
"eval_loss": 1.1336219310760498,
"eval_runtime": 258.1352,
"eval_samples_per_second": 157.29,
"eval_steps_per_second": 4.916,
"step": 15500
},
{
"epoch": 2.1935837674801206,
"grad_norm": 3.1457791328430176,
"learning_rate": 8.4e-05,
"loss": 1.0423,
"step": 16000
},
{
"epoch": 2.1935837674801206,
"eval_accuracy": 0.7996375561822584,
"eval_loss": 1.1223351955413818,
"eval_runtime": 258.998,
"eval_samples_per_second": 156.766,
"eval_steps_per_second": 4.9,
"step": 16000
},
{
"epoch": 2.2621332602138744,
"grad_norm": 2.9234752655029297,
"learning_rate": 8.35e-05,
"loss": 1.0449,
"step": 16500
},
{
"epoch": 2.2621332602138744,
"eval_accuracy": 0.8002099641553145,
"eval_loss": 1.1050431728363037,
"eval_runtime": 258.0627,
"eval_samples_per_second": 157.334,
"eval_steps_per_second": 4.917,
"step": 16500
},
{
"epoch": 2.330682752947628,
"grad_norm": 2.9982569217681885,
"learning_rate": 8.3e-05,
"loss": 1.0312,
"step": 17000
},
{
"epoch": 2.330682752947628,
"eval_accuracy": 0.8017049854116234,
"eval_loss": 1.1168311834335327,
"eval_runtime": 258.8729,
"eval_samples_per_second": 156.841,
"eval_steps_per_second": 4.902,
"step": 17000
},
{
"epoch": 2.399232245681382,
"grad_norm": 2.63649845123291,
"learning_rate": 8.25e-05,
"loss": 1.0291,
"step": 17500
},
{
"epoch": 2.399232245681382,
"eval_accuracy": 0.8025579020578063,
"eval_loss": 1.1074473857879639,
"eval_runtime": 257.7749,
"eval_samples_per_second": 157.51,
"eval_steps_per_second": 4.923,
"step": 17500
},
{
"epoch": 2.4677817384151357,
"grad_norm": 3.122042417526245,
"learning_rate": 8.2e-05,
"loss": 1.0245,
"step": 18000
},
{
"epoch": 2.4677817384151357,
"eval_accuracy": 0.802570536529715,
"eval_loss": 1.1076058149337769,
"eval_runtime": 258.567,
"eval_samples_per_second": 157.027,
"eval_steps_per_second": 4.908,
"step": 18000
},
{
"epoch": 2.5363312311488895,
"grad_norm": 3.2931995391845703,
"learning_rate": 8.15e-05,
"loss": 1.0236,
"step": 18500
},
{
"epoch": 2.5363312311488895,
"eval_accuracy": 0.803701212821904,
"eval_loss": 1.092323660850525,
"eval_runtime": 257.5769,
"eval_samples_per_second": 157.631,
"eval_steps_per_second": 4.927,
"step": 18500
},
{
"epoch": 2.6048807238826432,
"grad_norm": 3.31691837310791,
"learning_rate": 8.1e-05,
"loss": 1.0218,
"step": 19000
},
{
"epoch": 2.6048807238826432,
"eval_accuracy": 0.8040252089042342,
"eval_loss": 1.0852642059326172,
"eval_runtime": 258.7184,
"eval_samples_per_second": 156.935,
"eval_steps_per_second": 4.905,
"step": 19000
},
{
"epoch": 2.673430216616397,
"grad_norm": 2.602132558822632,
"learning_rate": 8.05e-05,
"loss": 1.0101,
"step": 19500
},
{
"epoch": 2.673430216616397,
"eval_accuracy": 0.8049598014763766,
"eval_loss": 1.0851576328277588,
"eval_runtime": 259.1257,
"eval_samples_per_second": 156.688,
"eval_steps_per_second": 4.897,
"step": 19500
},
{
"epoch": 2.741979709350151,
"grad_norm": 2.6089420318603516,
"learning_rate": 8e-05,
"loss": 1.0154,
"step": 20000
},
{
"epoch": 2.741979709350151,
"eval_accuracy": 0.8060563142838197,
"eval_loss": 1.0850160121917725,
"eval_runtime": 257.7528,
"eval_samples_per_second": 157.523,
"eval_steps_per_second": 4.923,
"step": 20000
},
{
"epoch": 2.8105292020839046,
"grad_norm": 2.57804536819458,
"learning_rate": 7.950000000000001e-05,
"loss": 1.0092,
"step": 20500
},
{
"epoch": 2.8105292020839046,
"eval_accuracy": 0.8063966673859484,
"eval_loss": 1.064876675605774,
"eval_runtime": 257.3862,
"eval_samples_per_second": 157.747,
"eval_steps_per_second": 4.93,
"step": 20500
},
{
"epoch": 2.8790786948176583,
"grad_norm": 2.919243097305298,
"learning_rate": 7.900000000000001e-05,
"loss": 0.9962,
"step": 21000
},
{
"epoch": 2.8790786948176583,
"eval_accuracy": 0.8074074812889718,
"eval_loss": 1.0758228302001953,
"eval_runtime": 257.4397,
"eval_samples_per_second": 157.715,
"eval_steps_per_second": 4.929,
"step": 21000
},
{
"epoch": 2.947628187551412,
"grad_norm": 2.7142975330352783,
"learning_rate": 7.850000000000001e-05,
"loss": 0.9949,
"step": 21500
},
{
"epoch": 2.947628187551412,
"eval_accuracy": 0.8072095077707074,
"eval_loss": 1.0723259449005127,
"eval_runtime": 257.4852,
"eval_samples_per_second": 157.687,
"eval_steps_per_second": 4.928,
"step": 21500
},
{
"epoch": 3.016177680285166,
"grad_norm": 2.461714267730713,
"learning_rate": 7.800000000000001e-05,
"loss": 0.9933,
"step": 22000
},
{
"epoch": 3.016177680285166,
"eval_accuracy": 0.8092224160930719,
"eval_loss": 1.0564687252044678,
"eval_runtime": 262.5555,
"eval_samples_per_second": 154.642,
"eval_steps_per_second": 4.833,
"step": 22000
},
{
"epoch": 3.0847271730189196,
"grad_norm": 3.128793716430664,
"learning_rate": 7.75e-05,
"loss": 0.9751,
"step": 22500
},
{
"epoch": 3.0847271730189196,
"eval_accuracy": 0.8092534032416199,
"eval_loss": 1.0642082691192627,
"eval_runtime": 263.2784,
"eval_samples_per_second": 154.217,
"eval_steps_per_second": 4.82,
"step": 22500
},
{
"epoch": 3.1532766657526734,
"grad_norm": 2.560393810272217,
"learning_rate": 7.7e-05,
"loss": 0.9792,
"step": 23000
},
{
"epoch": 3.1532766657526734,
"eval_accuracy": 0.8098133712563933,
"eval_loss": 1.0663542747497559,
"eval_runtime": 279.6936,
"eval_samples_per_second": 145.166,
"eval_steps_per_second": 4.537,
"step": 23000
},
{
"epoch": 3.221826158486427,
"grad_norm": 2.993088483810425,
"learning_rate": 7.65e-05,
"loss": 0.9671,
"step": 23500
},
{
"epoch": 3.221826158486427,
"eval_accuracy": 0.8106374773041439,
"eval_loss": 1.0509235858917236,
"eval_runtime": 302.0862,
"eval_samples_per_second": 134.405,
"eval_steps_per_second": 4.201,
"step": 23500
},
{
"epoch": 3.290375651220181,
"grad_norm": 2.796325445175171,
"learning_rate": 7.6e-05,
"loss": 0.9667,
"step": 24000
},
{
"epoch": 3.290375651220181,
"eval_accuracy": 0.8117766899143959,
"eval_loss": 1.0434768199920654,
"eval_runtime": 302.1312,
"eval_samples_per_second": 134.385,
"eval_steps_per_second": 4.2,
"step": 24000
},
{
"epoch": 3.3589251439539347,
"grad_norm": 3.067168712615967,
"learning_rate": 7.55e-05,
"loss": 0.9676,
"step": 24500
},
{
"epoch": 3.3589251439539347,
"eval_accuracy": 0.811503557227325,
"eval_loss": 1.0434749126434326,
"eval_runtime": 303.0455,
"eval_samples_per_second": 133.98,
"eval_steps_per_second": 4.187,
"step": 24500
},
{
"epoch": 3.4274746366876885,
"grad_norm": 6.064915657043457,
"learning_rate": 7.500000000000001e-05,
"loss": 0.9659,
"step": 25000
},
{
"epoch": 3.4274746366876885,
"eval_accuracy": 0.8128527567965343,
"eval_loss": 1.0416052341461182,
"eval_runtime": 301.3987,
"eval_samples_per_second": 134.712,
"eval_steps_per_second": 4.21,
"step": 25000
},
{
"epoch": 3.4960241294214423,
"grad_norm": 2.412940740585327,
"learning_rate": 7.450000000000001e-05,
"loss": 0.9534,
"step": 25500
},
{
"epoch": 3.4960241294214423,
"eval_accuracy": 0.8132953171379946,
"eval_loss": 1.0385822057724,
"eval_runtime": 300.1799,
"eval_samples_per_second": 135.259,
"eval_steps_per_second": 4.227,
"step": 25500
},
{
"epoch": 3.564573622155196,
"grad_norm": 2.9293601512908936,
"learning_rate": 7.4e-05,
"loss": 0.9579,
"step": 26000
},
{
"epoch": 3.564573622155196,
"eval_accuracy": 0.8135014968664439,
"eval_loss": 1.0341033935546875,
"eval_runtime": 302.1905,
"eval_samples_per_second": 134.359,
"eval_steps_per_second": 4.199,
"step": 26000
},
{
"epoch": 3.63312311488895,
"grad_norm": 2.3857290744781494,
"learning_rate": 7.35e-05,
"loss": 0.9562,
"step": 26500
},
{
"epoch": 3.63312311488895,
"eval_accuracy": 0.8136511493390661,
"eval_loss": 1.0400645732879639,
"eval_runtime": 300.9192,
"eval_samples_per_second": 134.927,
"eval_steps_per_second": 4.217,
"step": 26500
},
{
"epoch": 3.7016726076227036,
"grad_norm": 2.8844683170318604,
"learning_rate": 7.3e-05,
"loss": 0.9581,
"step": 27000
},
{
"epoch": 3.7016726076227036,
"eval_accuracy": 0.814749558109949,
"eval_loss": 1.0379000902175903,
"eval_runtime": 300.8792,
"eval_samples_per_second": 134.945,
"eval_steps_per_second": 4.218,
"step": 27000
},
{
"epoch": 3.7702221003564573,
"grad_norm": 3.2288286685943604,
"learning_rate": 7.25e-05,
"loss": 0.9524,
"step": 27500
},
{
"epoch": 3.7702221003564573,
"eval_accuracy": 0.8149326212786671,
"eval_loss": 1.0268869400024414,
"eval_runtime": 303.3311,
"eval_samples_per_second": 133.854,
"eval_steps_per_second": 4.184,
"step": 27500
},
{
"epoch": 3.838771593090211,
"grad_norm": 2.84405255317688,
"learning_rate": 7.2e-05,
"loss": 0.9366,
"step": 28000
},
{
"epoch": 3.838771593090211,
"eval_accuracy": 0.8165028910386263,
"eval_loss": 1.0258753299713135,
"eval_runtime": 301.0226,
"eval_samples_per_second": 134.88,
"eval_steps_per_second": 4.216,
"step": 28000
},
{
"epoch": 3.907321085823965,
"grad_norm": 2.78871488571167,
"learning_rate": 7.15e-05,
"loss": 0.9489,
"step": 28500
},
{
"epoch": 3.907321085823965,
"eval_accuracy": 0.8153732124964858,
"eval_loss": 1.0232901573181152,
"eval_runtime": 303.0362,
"eval_samples_per_second": 133.984,
"eval_steps_per_second": 4.188,
"step": 28500
},
{
"epoch": 3.9758705785577186,
"grad_norm": 2.6128122806549072,
"learning_rate": 7.1e-05,
"loss": 0.9372,
"step": 29000
},
{
"epoch": 3.9758705785577186,
"eval_accuracy": 0.8167775740472835,
"eval_loss": 1.0158660411834717,
"eval_runtime": 305.0716,
"eval_samples_per_second": 133.09,
"eval_steps_per_second": 4.16,
"step": 29000
},
{
"epoch": 4.044420071291473,
"grad_norm": 2.4649457931518555,
"learning_rate": 7.05e-05,
"loss": 0.9389,
"step": 29500
},
{
"epoch": 4.044420071291473,
"eval_accuracy": 0.8169951094301297,
"eval_loss": 1.0179492235183716,
"eval_runtime": 302.9032,
"eval_samples_per_second": 134.043,
"eval_steps_per_second": 4.189,
"step": 29500
},
{
"epoch": 4.112969564025226,
"grad_norm": 2.637385845184326,
"learning_rate": 7e-05,
"loss": 0.9224,
"step": 30000
},
{
"epoch": 4.112969564025226,
"eval_accuracy": 0.8178961830392573,
"eval_loss": 1.0067319869995117,
"eval_runtime": 300.9419,
"eval_samples_per_second": 134.916,
"eval_steps_per_second": 4.217,
"step": 30000
},
{
"epoch": 4.18151905675898,
"grad_norm": 2.863875389099121,
"learning_rate": 6.95e-05,
"loss": 0.9205,
"step": 30500
},
{
"epoch": 4.18151905675898,
"eval_accuracy": 0.8184249147223241,
"eval_loss": 1.0113714933395386,
"eval_runtime": 301.5875,
"eval_samples_per_second": 134.628,
"eval_steps_per_second": 4.208,
"step": 30500
},
{
"epoch": 4.250068549492734,
"grad_norm": 2.4322681427001953,
"learning_rate": 6.9e-05,
"loss": 0.9247,
"step": 31000
},
{
"epoch": 4.250068549492734,
"eval_accuracy": 0.8186936629410247,
"eval_loss": 1.003678798675537,
"eval_runtime": 303.1297,
"eval_samples_per_second": 133.943,
"eval_steps_per_second": 4.186,
"step": 31000
},
{
"epoch": 4.318618042226488,
"grad_norm": 2.998030424118042,
"learning_rate": 6.850000000000001e-05,
"loss": 0.9178,
"step": 31500
},
{
"epoch": 4.318618042226488,
"eval_accuracy": 0.8182408058742606,
"eval_loss": 1.0021617412567139,
"eval_runtime": 302.4668,
"eval_samples_per_second": 134.236,
"eval_steps_per_second": 4.196,
"step": 31500
},
{
"epoch": 4.387167534960241,
"grad_norm": 2.5294859409332275,
"learning_rate": 6.800000000000001e-05,
"loss": 0.9176,
"step": 32000
},
{
"epoch": 4.387167534960241,
"eval_accuracy": 0.8199272207420885,
"eval_loss": 1.0028120279312134,
"eval_runtime": 301.3201,
"eval_samples_per_second": 134.747,
"eval_steps_per_second": 4.211,
"step": 32000
},
{
"epoch": 4.4557170276939955,
"grad_norm": 4.368305206298828,
"learning_rate": 6.750000000000001e-05,
"loss": 0.9187,
"step": 32500
},
{
"epoch": 4.4557170276939955,
"eval_accuracy": 0.8202180503701034,
"eval_loss": 1.0021815299987793,
"eval_runtime": 301.1897,
"eval_samples_per_second": 134.805,
"eval_steps_per_second": 4.213,
"step": 32500
},
{
"epoch": 4.524266520427749,
"grad_norm": 3.3230433464050293,
"learning_rate": 6.7e-05,
"loss": 0.9169,
"step": 33000
},
{
"epoch": 4.524266520427749,
"eval_accuracy": 0.8205231724765897,
"eval_loss": 0.9979987740516663,
"eval_runtime": 300.7001,
"eval_samples_per_second": 135.025,
"eval_steps_per_second": 4.22,
"step": 33000
},
{
"epoch": 4.592816013161503,
"grad_norm": 2.592043876647949,
"learning_rate": 6.65e-05,
"loss": 0.9125,
"step": 33500
},
{
"epoch": 4.592816013161503,
"eval_accuracy": 0.8206935126919565,
"eval_loss": 0.9938598871231079,
"eval_runtime": 301.9997,
"eval_samples_per_second": 134.444,
"eval_steps_per_second": 4.202,
"step": 33500
},
{
"epoch": 4.661365505895256,
"grad_norm": 2.446427345275879,
"learning_rate": 6.6e-05,
"loss": 0.9146,
"step": 34000
},
{
"epoch": 4.661365505895256,
"eval_accuracy": 0.8216566473447208,
"eval_loss": 0.9849461913108826,
"eval_runtime": 302.9739,
"eval_samples_per_second": 134.012,
"eval_steps_per_second": 4.188,
"step": 34000
},
{
"epoch": 4.72991499862901,
"grad_norm": 2.884946346282959,
"learning_rate": 6.55e-05,
"loss": 0.9018,
"step": 34500
},
{
"epoch": 4.72991499862901,
"eval_accuracy": 0.8217915724349476,
"eval_loss": 1.0003894567489624,
"eval_runtime": 300.4012,
"eval_samples_per_second": 135.159,
"eval_steps_per_second": 4.224,
"step": 34500
},
{
"epoch": 4.798464491362764,
"grad_norm": 2.8886282444000244,
"learning_rate": 6.500000000000001e-05,
"loss": 0.9014,
"step": 35000
},
{
"epoch": 4.798464491362764,
"eval_accuracy": 0.8224961720866916,
"eval_loss": 0.9889456629753113,
"eval_runtime": 301.0802,
"eval_samples_per_second": 134.854,
"eval_steps_per_second": 4.215,
"step": 35000
},
{
"epoch": 4.867013984096518,
"grad_norm": 2.473068952560425,
"learning_rate": 6.450000000000001e-05,
"loss": 0.8919,
"step": 35500
},
{
"epoch": 4.867013984096518,
"eval_accuracy": 0.822945545786959,
"eval_loss": 0.9848706722259521,
"eval_runtime": 303.9067,
"eval_samples_per_second": 133.6,
"eval_steps_per_second": 4.176,
"step": 35500
},
{
"epoch": 4.935563476830271,
"grad_norm": 3.0716209411621094,
"learning_rate": 6.400000000000001e-05,
"loss": 0.8993,
"step": 36000
},
{
"epoch": 4.935563476830271,
"eval_accuracy": 0.8222583681418539,
"eval_loss": 0.9929753541946411,
"eval_runtime": 299.5059,
"eval_samples_per_second": 135.563,
"eval_steps_per_second": 4.237,
"step": 36000
},
{
"epoch": 5.004112969564026,
"grad_norm": 2.3323957920074463,
"learning_rate": 6.35e-05,
"loss": 0.9009,
"step": 36500
},
{
"epoch": 5.004112969564026,
"eval_accuracy": 0.8241444257225273,
"eval_loss": 0.9798668622970581,
"eval_runtime": 299.3852,
"eval_samples_per_second": 135.618,
"eval_steps_per_second": 4.239,
"step": 36500
},
{
"epoch": 5.072662462297779,
"grad_norm": 2.7152209281921387,
"learning_rate": 6.3e-05,
"loss": 0.8843,
"step": 37000
},
{
"epoch": 5.072662462297779,
"eval_accuracy": 0.8234012575934279,
"eval_loss": 0.9811968803405762,
"eval_runtime": 300.6715,
"eval_samples_per_second": 135.038,
"eval_steps_per_second": 4.221,
"step": 37000
},
{
"epoch": 5.141211955031533,
"grad_norm": 2.526486396789551,
"learning_rate": 6.25e-05,
"loss": 0.8846,
"step": 37500
},
{
"epoch": 5.141211955031533,
"eval_accuracy": 0.8247555724795991,
"eval_loss": 0.9730820655822754,
"eval_runtime": 299.291,
"eval_samples_per_second": 135.661,
"eval_steps_per_second": 4.24,
"step": 37500
},
{
"epoch": 5.2097614477652865,
"grad_norm": 2.5805063247680664,
"learning_rate": 6.2e-05,
"loss": 0.8807,
"step": 38000
},
{
"epoch": 5.2097614477652865,
"eval_accuracy": 0.8250464067024924,
"eval_loss": 0.9684708118438721,
"eval_runtime": 302.4457,
"eval_samples_per_second": 134.246,
"eval_steps_per_second": 4.196,
"step": 38000
},
{
"epoch": 5.278310940499041,
"grad_norm": 2.559605360031128,
"learning_rate": 6.15e-05,
"loss": 0.8802,
"step": 38500
},
{
"epoch": 5.278310940499041,
"eval_accuracy": 0.8254996987535631,
"eval_loss": 0.973818838596344,
"eval_runtime": 300.146,
"eval_samples_per_second": 135.274,
"eval_steps_per_second": 4.228,
"step": 38500
},
{
"epoch": 5.346860433232794,
"grad_norm": 2.1615304946899414,
"learning_rate": 6.1e-05,
"loss": 0.8789,
"step": 39000
},
{
"epoch": 5.346860433232794,
"eval_accuracy": 0.8254549864960571,
"eval_loss": 0.9578101634979248,
"eval_runtime": 302.5653,
"eval_samples_per_second": 134.193,
"eval_steps_per_second": 4.194,
"step": 39000
},
{
"epoch": 5.415409925966548,
"grad_norm": 2.2763609886169434,
"learning_rate": 6.05e-05,
"loss": 0.8843,
"step": 39500
},
{
"epoch": 5.415409925966548,
"eval_accuracy": 0.8262787384248012,
"eval_loss": 0.9698151350021362,
"eval_runtime": 301.7649,
"eval_samples_per_second": 134.548,
"eval_steps_per_second": 4.205,
"step": 39500
},
{
"epoch": 5.483959418700302,
"grad_norm": 2.3774330615997314,
"learning_rate": 6e-05,
"loss": 0.8714,
"step": 40000
},
{
"epoch": 5.483959418700302,
"eval_accuracy": 0.8263406985859876,
"eval_loss": 0.9681651592254639,
"eval_runtime": 302.4298,
"eval_samples_per_second": 134.253,
"eval_steps_per_second": 4.196,
"step": 40000
},
{
"epoch": 5.552508911434055,
"grad_norm": 2.3430614471435547,
"learning_rate": 5.95e-05,
"loss": 0.8676,
"step": 40500
},
{
"epoch": 5.552508911434055,
"eval_accuracy": 0.8263813882499345,
"eval_loss": 0.9515417814254761,
"eval_runtime": 301.876,
"eval_samples_per_second": 134.499,
"eval_steps_per_second": 4.204,
"step": 40500
},
{
"epoch": 5.621058404167809,
"grad_norm": 2.3059141635894775,
"learning_rate": 5.9e-05,
"loss": 0.8721,
"step": 41000
},
{
"epoch": 5.621058404167809,
"eval_accuracy": 0.8275216377261674,
"eval_loss": 0.962243914604187,
"eval_runtime": 302.2447,
"eval_samples_per_second": 134.335,
"eval_steps_per_second": 4.199,
"step": 41000
},
{
"epoch": 5.689607896901563,
"grad_norm": 2.462218999862671,
"learning_rate": 5.85e-05,
"loss": 0.8699,
"step": 41500
},
{
"epoch": 5.689607896901563,
"eval_accuracy": 0.8283203266406056,
"eval_loss": 0.9477165341377258,
"eval_runtime": 302.7485,
"eval_samples_per_second": 134.111,
"eval_steps_per_second": 4.192,
"step": 41500
},
{
"epoch": 5.758157389635317,
"grad_norm": 3.0347349643707275,
"learning_rate": 5.8e-05,
"loss": 0.8634,
"step": 42000
},
{
"epoch": 5.758157389635317,
"eval_accuracy": 0.8281108707620414,
"eval_loss": 0.9486715197563171,
"eval_runtime": 304.225,
"eval_samples_per_second": 133.46,
"eval_steps_per_second": 4.171,
"step": 42000
},
{
"epoch": 5.82670688236907,
"grad_norm": 3.0054922103881836,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.8743,
"step": 42500
},
{
"epoch": 5.82670688236907,
"eval_accuracy": 0.8284311134181968,
"eval_loss": 0.9539070725440979,
"eval_runtime": 304.8553,
"eval_samples_per_second": 133.185,
"eval_steps_per_second": 4.163,
"step": 42500
},
{
"epoch": 5.895256375102824,
"grad_norm": 2.29243540763855,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.8667,
"step": 43000
},
{
"epoch": 5.895256375102824,
"eval_accuracy": 0.8291235685160401,
"eval_loss": 0.9470139145851135,
"eval_runtime": 298.7685,
"eval_samples_per_second": 135.898,
"eval_steps_per_second": 4.247,
"step": 43000
},
{
"epoch": 5.963805867836578,
"grad_norm": 2.5743372440338135,
"learning_rate": 5.65e-05,
"loss": 0.8681,
"step": 43500
},
{
"epoch": 5.963805867836578,
"eval_accuracy": 0.8291158725629887,
"eval_loss": 0.946834921836853,
"eval_runtime": 284.2736,
"eval_samples_per_second": 142.827,
"eval_steps_per_second": 4.464,
"step": 43500
},
{
"epoch": 6.032355360570332,
"grad_norm": 2.33494234085083,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.8594,
"step": 44000
},
{
"epoch": 6.032355360570332,
"eval_accuracy": 0.8301663716691428,
"eval_loss": 0.9472524523735046,
"eval_runtime": 261.9172,
"eval_samples_per_second": 155.018,
"eval_steps_per_second": 4.845,
"step": 44000
},
{
"epoch": 6.100904853304086,
"grad_norm": 2.7616426944732666,
"learning_rate": 5.550000000000001e-05,
"loss": 0.8517,
"step": 44500
},
{
"epoch": 6.100904853304086,
"eval_accuracy": 0.8304027916380742,
"eval_loss": 0.9408496022224426,
"eval_runtime": 257.341,
"eval_samples_per_second": 157.775,
"eval_steps_per_second": 4.931,
"step": 44500
},
{
"epoch": 6.169454346037839,
"grad_norm": 2.6394338607788086,
"learning_rate": 5.500000000000001e-05,
"loss": 0.8453,
"step": 45000
},
{
"epoch": 6.169454346037839,
"eval_accuracy": 0.8302381896975964,
"eval_loss": 0.945652425289154,
"eval_runtime": 257.4862,
"eval_samples_per_second": 157.686,
"eval_steps_per_second": 4.928,
"step": 45000
},
{
"epoch": 6.2380038387715935,
"grad_norm": 2.6004316806793213,
"learning_rate": 5.45e-05,
"loss": 0.8486,
"step": 45500
},
{
"epoch": 6.2380038387715935,
"eval_accuracy": 0.8311095745962099,
"eval_loss": 0.940719485282898,
"eval_runtime": 258.4002,
"eval_samples_per_second": 157.128,
"eval_steps_per_second": 4.911,
"step": 45500
},
{
"epoch": 6.306553331505347,
"grad_norm": 2.722169876098633,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.8469,
"step": 46000
},
{
"epoch": 6.306553331505347,
"eval_accuracy": 0.8303636681664364,
"eval_loss": 0.9365447759628296,
"eval_runtime": 257.5562,
"eval_samples_per_second": 157.643,
"eval_steps_per_second": 4.927,
"step": 46000
},
{
"epoch": 6.375102824239101,
"grad_norm": 2.955397367477417,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.8434,
"step": 46500
},
{
"epoch": 6.375102824239101,
"eval_accuracy": 0.8311330928241353,
"eval_loss": 0.9382375478744507,
"eval_runtime": 258.6675,
"eval_samples_per_second": 156.966,
"eval_steps_per_second": 4.906,
"step": 46500
},
{
"epoch": 6.443652316972854,
"grad_norm": 2.375140428543091,
"learning_rate": 5.300000000000001e-05,
"loss": 0.8343,
"step": 47000
},
{
"epoch": 6.443652316972854,
"eval_accuracy": 0.8315033650181133,
"eval_loss": 0.934901773929596,
"eval_runtime": 257.5199,
"eval_samples_per_second": 157.665,
"eval_steps_per_second": 4.928,
"step": 47000
},
{
"epoch": 6.512201809706609,
"grad_norm": 2.4617624282836914,
"learning_rate": 5.25e-05,
"loss": 0.8312,
"step": 47500
},
{
"epoch": 6.512201809706609,
"eval_accuracy": 0.8317886916287004,
"eval_loss": 0.9246230721473694,
"eval_runtime": 258.5693,
"eval_samples_per_second": 157.026,
"eval_steps_per_second": 4.908,
"step": 47500
},
{
"epoch": 6.580751302440362,
"grad_norm": 2.4794909954071045,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.8365,
"step": 48000
},
{
"epoch": 6.580751302440362,
"eval_accuracy": 0.8332747655954476,
"eval_loss": 0.9223575592041016,
"eval_runtime": 257.532,
"eval_samples_per_second": 157.658,
"eval_steps_per_second": 4.928,
"step": 48000
},
{
"epoch": 6.649300795174116,
"grad_norm": 2.893775224685669,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.8307,
"step": 48500
},
{
"epoch": 6.649300795174116,
"eval_accuracy": 0.8332412930945546,
"eval_loss": 0.9224662184715271,
"eval_runtime": 257.3988,
"eval_samples_per_second": 157.74,
"eval_steps_per_second": 4.93,
"step": 48500
},
{
"epoch": 6.717850287907869,
"grad_norm": 2.4228713512420654,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.838,
"step": 49000
},
{
"epoch": 6.717850287907869,
"eval_accuracy": 0.8337124591782815,
"eval_loss": 0.9226129055023193,
"eval_runtime": 258.4175,
"eval_samples_per_second": 157.118,
"eval_steps_per_second": 4.911,
"step": 49000
},
{
"epoch": 6.786399780641624,
"grad_norm": 2.462571144104004,
"learning_rate": 5.05e-05,
"loss": 0.8355,
"step": 49500
},
{
"epoch": 6.786399780641624,
"eval_accuracy": 0.8337023695286286,
"eval_loss": 0.9333141446113586,
"eval_runtime": 257.7508,
"eval_samples_per_second": 157.524,
"eval_steps_per_second": 4.923,
"step": 49500
},
{
"epoch": 6.854949273375377,
"grad_norm": 2.5558559894561768,
"learning_rate": 5e-05,
"loss": 0.8391,
"step": 50000
},
{
"epoch": 6.854949273375377,
"eval_accuracy": 0.8339626635022332,
"eval_loss": 0.9166584610939026,
"eval_runtime": 258.4946,
"eval_samples_per_second": 157.071,
"eval_steps_per_second": 4.909,
"step": 50000
},
{
"epoch": 6.923498766109131,
"grad_norm": 2.733778953552246,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.834,
"step": 50500
},
{
"epoch": 6.923498766109131,
"eval_accuracy": 0.8344111438621534,
"eval_loss": 0.9235773086547852,
"eval_runtime": 258.6401,
"eval_samples_per_second": 156.983,
"eval_steps_per_second": 4.906,
"step": 50500
},
{
"epoch": 6.9920482588428845,
"grad_norm": 2.802053451538086,
"learning_rate": 4.9e-05,
"loss": 0.8269,
"step": 51000
},
{
"epoch": 6.9920482588428845,
"eval_accuracy": 0.8348735131396657,
"eval_loss": 0.920405924320221,
"eval_runtime": 257.5552,
"eval_samples_per_second": 157.644,
"eval_steps_per_second": 4.927,
"step": 51000
},
{
"epoch": 7.060597751576639,
"grad_norm": 2.666555404663086,
"learning_rate": 4.85e-05,
"loss": 0.8085,
"step": 51500
},
{
"epoch": 7.060597751576639,
"eval_accuracy": 0.8347147667297783,
"eval_loss": 0.9154396057128906,
"eval_runtime": 258.4259,
"eval_samples_per_second": 157.113,
"eval_steps_per_second": 4.91,
"step": 51500
},
{
"epoch": 7.129147244310392,
"grad_norm": 2.4190189838409424,
"learning_rate": 4.8e-05,
"loss": 0.819,
"step": 52000
},
{
"epoch": 7.129147244310392,
"eval_accuracy": 0.8344749377257509,
"eval_loss": 0.9256834983825684,
"eval_runtime": 258.4604,
"eval_samples_per_second": 157.092,
"eval_steps_per_second": 4.91,
"step": 52000
},
{
"epoch": 7.197696737044146,
"grad_norm": 2.8802294731140137,
"learning_rate": 4.75e-05,
"loss": 0.8238,
"step": 52500
},
{
"epoch": 7.197696737044146,
"eval_accuracy": 0.8351059340465287,
"eval_loss": 0.9185708165168762,
"eval_runtime": 258.4374,
"eval_samples_per_second": 157.106,
"eval_steps_per_second": 4.91,
"step": 52500
},
{
"epoch": 7.2662462297779,
"grad_norm": 2.5864334106445312,
"learning_rate": 4.7e-05,
"loss": 0.8065,
"step": 53000
},
{
"epoch": 7.2662462297779,
"eval_accuracy": 0.8358540639956455,
"eval_loss": 0.9095313549041748,
"eval_runtime": 258.6881,
"eval_samples_per_second": 156.953,
"eval_steps_per_second": 4.906,
"step": 53000
},
{
"epoch": 7.334795722511654,
"grad_norm": 3.0811657905578613,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.8199,
"step": 53500
},
{
"epoch": 7.334795722511654,
"eval_accuracy": 0.8364544665607716,
"eval_loss": 0.9065914154052734,
"eval_runtime": 258.6721,
"eval_samples_per_second": 156.963,
"eval_steps_per_second": 4.906,
"step": 53500
},
{
"epoch": 7.403345215245407,
"grad_norm": 2.3102753162384033,
"learning_rate": 4.600000000000001e-05,
"loss": 0.8018,
"step": 54000
},
{
"epoch": 7.403345215245407,
"eval_accuracy": 0.8359817384838574,
"eval_loss": 0.9075337052345276,
"eval_runtime": 257.5046,
"eval_samples_per_second": 157.675,
"eval_steps_per_second": 4.928,
"step": 54000
},
{
"epoch": 7.471894707979161,
"grad_norm": 2.269843578338623,
"learning_rate": 4.55e-05,
"loss": 0.8102,
"step": 54500
},
{
"epoch": 7.471894707979161,
"eval_accuracy": 0.8372314644838865,
"eval_loss": 0.9001559019088745,
"eval_runtime": 258.4508,
"eval_samples_per_second": 157.098,
"eval_steps_per_second": 4.91,
"step": 54500
},
{
"epoch": 7.540444200712915,
"grad_norm": 2.617309093475342,
"learning_rate": 4.5e-05,
"loss": 0.8194,
"step": 55000
},
{
"epoch": 7.540444200712915,
"eval_accuracy": 0.8369146258637555,
"eval_loss": 0.899241030216217,
"eval_runtime": 257.4931,
"eval_samples_per_second": 157.682,
"eval_steps_per_second": 4.928,
"step": 55000
},
{
"epoch": 7.608993693446669,
"grad_norm": 2.4634625911712646,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.8138,
"step": 55500
},
{
"epoch": 7.608993693446669,
"eval_accuracy": 0.8369996392235467,
"eval_loss": 0.9033562541007996,
"eval_runtime": 258.54,
"eval_samples_per_second": 157.043,
"eval_steps_per_second": 4.908,
"step": 55500
},
{
"epoch": 7.677543186180422,
"grad_norm": 2.850604772567749,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.8077,
"step": 56000
},
{
"epoch": 7.677543186180422,
"eval_accuracy": 0.8378181332713783,
"eval_loss": 0.9002473950386047,
"eval_runtime": 257.5581,
"eval_samples_per_second": 157.642,
"eval_steps_per_second": 4.927,
"step": 56000
},
{
"epoch": 7.746092678914176,
"grad_norm": 2.3677656650543213,
"learning_rate": 4.35e-05,
"loss": 0.8119,
"step": 56500
},
{
"epoch": 7.746092678914176,
"eval_accuracy": 0.8382744553424182,
"eval_loss": 0.8992937803268433,
"eval_runtime": 258.5324,
"eval_samples_per_second": 157.048,
"eval_steps_per_second": 4.908,
"step": 56500
},
{
"epoch": 7.81464217164793,
"grad_norm": 2.0961389541625977,
"learning_rate": 4.3e-05,
"loss": 0.8029,
"step": 57000
},
{
"epoch": 7.81464217164793,
"eval_accuracy": 0.8377691746192005,
"eval_loss": 0.8913019895553589,
"eval_runtime": 258.5294,
"eval_samples_per_second": 157.05,
"eval_steps_per_second": 4.909,
"step": 57000
},
{
"epoch": 7.883191664381684,
"grad_norm": 2.424496650695801,
"learning_rate": 4.25e-05,
"loss": 0.802,
"step": 57500
},
{
"epoch": 7.883191664381684,
"eval_accuracy": 0.8387740254965514,
"eval_loss": 0.8913179039955139,
"eval_runtime": 258.6018,
"eval_samples_per_second": 157.006,
"eval_steps_per_second": 4.907,
"step": 57500
},
{
"epoch": 7.951741157115437,
"grad_norm": 2.8273098468780518,
"learning_rate": 4.2e-05,
"loss": 0.7887,
"step": 58000
},
{
"epoch": 7.951741157115437,
"eval_accuracy": 0.8389023572318363,
"eval_loss": 0.8917869329452515,
"eval_runtime": 259.2663,
"eval_samples_per_second": 156.603,
"eval_steps_per_second": 4.895,
"step": 58000
},
{
"epoch": 8.02029064984919,
"grad_norm": 2.5863022804260254,
"learning_rate": 4.15e-05,
"loss": 0.7902,
"step": 58500
},
{
"epoch": 8.02029064984919,
"eval_accuracy": 0.8385434985627828,
"eval_loss": 0.8866747617721558,
"eval_runtime": 257.9905,
"eval_samples_per_second": 157.378,
"eval_steps_per_second": 4.919,
"step": 58500
},
{
"epoch": 8.088840142582946,
"grad_norm": 2.357172727584839,
"learning_rate": 4.1e-05,
"loss": 0.7892,
"step": 59000
},
{
"epoch": 8.088840142582946,
"eval_accuracy": 0.8392591622264518,
"eval_loss": 0.8825114369392395,
"eval_runtime": 258.6746,
"eval_samples_per_second": 156.962,
"eval_steps_per_second": 4.906,
"step": 59000
},
{
"epoch": 8.157389635316699,
"grad_norm": 2.3970017433166504,
"learning_rate": 4.05e-05,
"loss": 0.7928,
"step": 59500
},
{
"epoch": 8.157389635316699,
"eval_accuracy": 0.8398791985985096,
"eval_loss": 0.8858514428138733,
"eval_runtime": 258.8038,
"eval_samples_per_second": 156.883,
"eval_steps_per_second": 4.903,
"step": 59500
},
{
"epoch": 8.225939128050452,
"grad_norm": 2.1816744804382324,
"learning_rate": 4e-05,
"loss": 0.786,
"step": 60000
},
{
"epoch": 8.225939128050452,
"eval_accuracy": 0.8399906549712216,
"eval_loss": 0.8871041536331177,
"eval_runtime": 257.814,
"eval_samples_per_second": 157.486,
"eval_steps_per_second": 4.922,
"step": 60000
},
{
"epoch": 8.294488620784206,
"grad_norm": 2.6891512870788574,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.7838,
"step": 60500
},
{
"epoch": 8.294488620784206,
"eval_accuracy": 0.8402320722657605,
"eval_loss": 0.8783635497093201,
"eval_runtime": 257.6277,
"eval_samples_per_second": 157.6,
"eval_steps_per_second": 4.926,
"step": 60500
},
{
"epoch": 8.36303811351796,
"grad_norm": 2.2459070682525635,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.7857,
"step": 61000
},
{
"epoch": 8.36303811351796,
"eval_accuracy": 0.8401854172212088,
"eval_loss": 0.8782520294189453,
"eval_runtime": 258.8529,
"eval_samples_per_second": 156.854,
"eval_steps_per_second": 4.902,
"step": 61000
},
{
"epoch": 8.431587606251714,
"grad_norm": 2.3516621589660645,
"learning_rate": 3.85e-05,
"loss": 0.7807,
"step": 61500
},
{
"epoch": 8.431587606251714,
"eval_accuracy": 0.8408464558399127,
"eval_loss": 0.8879706263542175,
"eval_runtime": 258.721,
"eval_samples_per_second": 156.934,
"eval_steps_per_second": 4.905,
"step": 61500
},
{
"epoch": 8.500137098985467,
"grad_norm": 2.408498764038086,
"learning_rate": 3.8e-05,
"loss": 0.7869,
"step": 62000
},
{
"epoch": 8.500137098985467,
"eval_accuracy": 0.8411305726347277,
"eval_loss": 0.8754673004150391,
"eval_runtime": 258.7894,
"eval_samples_per_second": 156.892,
"eval_steps_per_second": 4.904,
"step": 62000
},
{
"epoch": 8.56868659171922,
"grad_norm": 2.7398715019226074,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.7768,
"step": 62500
},
{
"epoch": 8.56868659171922,
"eval_accuracy": 0.8415116048695496,
"eval_loss": 0.8766404390335083,
"eval_runtime": 258.7705,
"eval_samples_per_second": 156.903,
"eval_steps_per_second": 4.904,
"step": 62500
},
{
"epoch": 8.637236084452976,
"grad_norm": 2.7151975631713867,
"learning_rate": 3.7e-05,
"loss": 0.7806,
"step": 63000
},
{
"epoch": 8.637236084452976,
"eval_accuracy": 0.8416668834829366,
"eval_loss": 0.8836163282394409,
"eval_runtime": 257.8541,
"eval_samples_per_second": 157.461,
"eval_steps_per_second": 4.921,
"step": 63000
},
{
"epoch": 8.70578557718673,
"grad_norm": 2.50140380859375,
"learning_rate": 3.65e-05,
"loss": 0.7811,
"step": 63500
},
{
"epoch": 8.70578557718673,
"eval_accuracy": 0.8423243126907809,
"eval_loss": 0.8704027533531189,
"eval_runtime": 256.7585,
"eval_samples_per_second": 158.133,
"eval_steps_per_second": 4.942,
"step": 63500
},
{
"epoch": 8.774335069920483,
"grad_norm": 2.9045302867889404,
"learning_rate": 3.6e-05,
"loss": 0.7733,
"step": 64000
},
{
"epoch": 8.774335069920483,
"eval_accuracy": 0.8424119193113302,
"eval_loss": 0.8675287365913391,
"eval_runtime": 258.0076,
"eval_samples_per_second": 157.367,
"eval_steps_per_second": 4.918,
"step": 64000
},
{
"epoch": 8.842884562654236,
"grad_norm": 2.8266477584838867,
"learning_rate": 3.55e-05,
"loss": 0.7728,
"step": 64500
},
{
"epoch": 8.842884562654236,
"eval_accuracy": 0.8425746458723948,
"eval_loss": 0.8766723871231079,
"eval_runtime": 257.1101,
"eval_samples_per_second": 157.917,
"eval_steps_per_second": 4.936,
"step": 64500
},
{
"epoch": 8.911434055387991,
"grad_norm": 2.3678436279296875,
"learning_rate": 3.5e-05,
"loss": 0.7779,
"step": 65000
},
{
"epoch": 8.911434055387991,
"eval_accuracy": 0.842695820921136,
"eval_loss": 0.8741580843925476,
"eval_runtime": 258.4977,
"eval_samples_per_second": 157.069,
"eval_steps_per_second": 4.909,
"step": 65000
},
{
"epoch": 8.979983548121744,
"grad_norm": 2.4681596755981445,
"learning_rate": 3.45e-05,
"loss": 0.7779,
"step": 65500
},
{
"epoch": 8.979983548121744,
"eval_accuracy": 0.8435885693668189,
"eval_loss": 0.8590840697288513,
"eval_runtime": 258.179,
"eval_samples_per_second": 157.263,
"eval_steps_per_second": 4.915,
"step": 65500
},
{
"epoch": 9.048533040855498,
"grad_norm": 2.4200708866119385,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.7704,
"step": 66000
},
{
"epoch": 9.048533040855498,
"eval_accuracy": 0.8437169036285643,
"eval_loss": 0.8766728639602661,
"eval_runtime": 258.0969,
"eval_samples_per_second": 157.313,
"eval_steps_per_second": 4.917,
"step": 66000
},
{
"epoch": 9.117082533589251,
"grad_norm": 2.753324270248413,
"learning_rate": 3.35e-05,
"loss": 0.7695,
"step": 66500
},
{
"epoch": 9.117082533589251,
"eval_accuracy": 0.8435838175840091,
"eval_loss": 0.881564199924469,
"eval_runtime": 259.2788,
"eval_samples_per_second": 156.596,
"eval_steps_per_second": 4.894,
"step": 66500
},
{
"epoch": 9.185632026323006,
"grad_norm": 2.490852117538452,
"learning_rate": 3.3e-05,
"loss": 0.7617,
"step": 67000
},
{
"epoch": 9.185632026323006,
"eval_accuracy": 0.8437435512372087,
"eval_loss": 0.8751281499862671,
"eval_runtime": 256.1107,
"eval_samples_per_second": 158.533,
"eval_steps_per_second": 4.955,
"step": 67000
},
{
"epoch": 9.25418151905676,
"grad_norm": 2.581777334213257,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.7585,
"step": 67500
},
{
"epoch": 9.25418151905676,
"eval_accuracy": 0.8439763030486503,
"eval_loss": 0.8657551407814026,
"eval_runtime": 257.7079,
"eval_samples_per_second": 157.55,
"eval_steps_per_second": 4.924,
"step": 67500
},
{
"epoch": 9.322731011790513,
"grad_norm": 2.997283458709717,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.7657,
"step": 68000
},
{
"epoch": 9.322731011790513,
"eval_accuracy": 0.8446350018812996,
"eval_loss": 0.8639153838157654,
"eval_runtime": 256.5945,
"eval_samples_per_second": 158.234,
"eval_steps_per_second": 4.946,
"step": 68000
},
{
"epoch": 9.391280504524266,
"grad_norm": 2.7763564586639404,
"learning_rate": 3.15e-05,
"loss": 0.759,
"step": 68500
},
{
"epoch": 9.391280504524266,
"eval_accuracy": 0.8448020556501544,
"eval_loss": 0.8533274531364441,
"eval_runtime": 256.963,
"eval_samples_per_second": 158.007,
"eval_steps_per_second": 4.938,
"step": 68500
},
{
"epoch": 9.459829997258021,
"grad_norm": 2.864605665206909,
"learning_rate": 3.1e-05,
"loss": 0.7574,
"step": 69000
},
{
"epoch": 9.459829997258021,
"eval_accuracy": 0.8444638910956694,
"eval_loss": 0.8697899580001831,
"eval_runtime": 255.8551,
"eval_samples_per_second": 158.691,
"eval_steps_per_second": 4.96,
"step": 69000
},
{
"epoch": 9.528379489991774,
"grad_norm": 2.5367231369018555,
"learning_rate": 3.05e-05,
"loss": 0.7529,
"step": 69500
},
{
"epoch": 9.528379489991774,
"eval_accuracy": 0.8451616733099838,
"eval_loss": 0.8582028746604919,
"eval_runtime": 256.6803,
"eval_samples_per_second": 158.181,
"eval_steps_per_second": 4.944,
"step": 69500
},
{
"epoch": 9.596928982725528,
"grad_norm": 2.1710877418518066,
"learning_rate": 3e-05,
"loss": 0.7577,
"step": 70000
},
{
"epoch": 9.596928982725528,
"eval_accuracy": 0.845897620114414,
"eval_loss": 0.8616137504577637,
"eval_runtime": 255.6697,
"eval_samples_per_second": 158.806,
"eval_steps_per_second": 4.963,
"step": 70000
},
{
"epoch": 9.665478475459281,
"grad_norm": 2.400867462158203,
"learning_rate": 2.95e-05,
"loss": 0.7554,
"step": 70500
},
{
"epoch": 9.665478475459281,
"eval_accuracy": 0.8460372178025299,
"eval_loss": 0.8466119766235352,
"eval_runtime": 256.0692,
"eval_samples_per_second": 158.559,
"eval_steps_per_second": 4.956,
"step": 70500
},
{
"epoch": 9.734027968193036,
"grad_norm": 2.6465237140655518,
"learning_rate": 2.9e-05,
"loss": 0.7406,
"step": 71000
},
{
"epoch": 9.734027968193036,
"eval_accuracy": 0.846062622809469,
"eval_loss": 0.8593913316726685,
"eval_runtime": 257.0976,
"eval_samples_per_second": 157.924,
"eval_steps_per_second": 4.936,
"step": 71000
},
{
"epoch": 9.80257746092679,
"grad_norm": 2.72021222114563,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.7543,
"step": 71500
},
{
"epoch": 9.80257746092679,
"eval_accuracy": 0.8463149479101548,
"eval_loss": 0.8515172600746155,
"eval_runtime": 256.7199,
"eval_samples_per_second": 158.157,
"eval_steps_per_second": 4.943,
"step": 71500
},
{
"epoch": 9.871126953660543,
"grad_norm": 2.639139175415039,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.7506,
"step": 72000
},
{
"epoch": 9.871126953660543,
"eval_accuracy": 0.8465278129829884,
"eval_loss": 0.8525589108467102,
"eval_runtime": 258.3778,
"eval_samples_per_second": 157.142,
"eval_steps_per_second": 4.911,
"step": 72000
},
{
"epoch": 9.939676446394296,
"grad_norm": 2.2242422103881836,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.7517,
"step": 72500
},
{
"epoch": 9.939676446394296,
"eval_accuracy": 0.8470482563336492,
"eval_loss": 0.8529332876205444,
"eval_runtime": 257.3598,
"eval_samples_per_second": 157.764,
"eval_steps_per_second": 4.931,
"step": 72500
},
{
"epoch": 10.008225939128051,
"grad_norm": 2.333671808242798,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.7425,
"step": 73000
},
{
"epoch": 10.008225939128051,
"eval_accuracy": 0.8467576970668704,
"eval_loss": 0.8543536067008972,
"eval_runtime": 257.3323,
"eval_samples_per_second": 157.78,
"eval_steps_per_second": 4.931,
"step": 73000
},
{
"epoch": 10.076775431861805,
"grad_norm": 2.4138877391815186,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.7464,
"step": 73500
},
{
"epoch": 10.076775431861805,
"eval_accuracy": 0.8473971965940057,
"eval_loss": 0.8455188870429993,
"eval_runtime": 257.2349,
"eval_samples_per_second": 157.84,
"eval_steps_per_second": 4.933,
"step": 73500
},
{
"epoch": 10.145324924595558,
"grad_norm": 2.812563180923462,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.7412,
"step": 74000
},
{
"epoch": 10.145324924595558,
"eval_accuracy": 0.8475976099587939,
"eval_loss": 0.8453831076622009,
"eval_runtime": 256.9915,
"eval_samples_per_second": 157.99,
"eval_steps_per_second": 4.938,
"step": 74000
},
{
"epoch": 10.213874417329311,
"grad_norm": 2.369260549545288,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.7346,
"step": 74500
},
{
"epoch": 10.213874417329311,
"eval_accuracy": 0.8474954397549382,
"eval_loss": 0.8512648344039917,
"eval_runtime": 258.0335,
"eval_samples_per_second": 157.352,
"eval_steps_per_second": 4.918,
"step": 74500
},
{
"epoch": 10.282423910063066,
"grad_norm": 2.7622134685516357,
"learning_rate": 2.5e-05,
"loss": 0.7424,
"step": 75000
},
{
"epoch": 10.282423910063066,
"eval_accuracy": 0.8481833714959756,
"eval_loss": 0.8343672156333923,
"eval_runtime": 258.2629,
"eval_samples_per_second": 157.212,
"eval_steps_per_second": 4.914,
"step": 75000
},
{
"epoch": 10.35097340279682,
"grad_norm": 2.2065768241882324,
"learning_rate": 2.45e-05,
"loss": 0.7364,
"step": 75500
},
{
"epoch": 10.35097340279682,
"eval_accuracy": 0.8482113108532771,
"eval_loss": 0.8340145945549011,
"eval_runtime": 258.3206,
"eval_samples_per_second": 157.177,
"eval_steps_per_second": 4.913,
"step": 75500
},
{
"epoch": 10.419522895530573,
"grad_norm": 2.186100721359253,
"learning_rate": 2.4e-05,
"loss": 0.7409,
"step": 76000
},
{
"epoch": 10.419522895530573,
"eval_accuracy": 0.8489686223957396,
"eval_loss": 0.8362465500831604,
"eval_runtime": 256.9187,
"eval_samples_per_second": 158.034,
"eval_steps_per_second": 4.939,
"step": 76000
},
{
"epoch": 10.488072388264326,
"grad_norm": 2.706817626953125,
"learning_rate": 2.35e-05,
"loss": 0.7353,
"step": 76500
},
{
"epoch": 10.488072388264326,
"eval_accuracy": 0.8487501900476949,
"eval_loss": 0.8368015289306641,
"eval_runtime": 257.1636,
"eval_samples_per_second": 157.884,
"eval_steps_per_second": 4.935,
"step": 76500
},
{
"epoch": 10.556621880998081,
"grad_norm": 2.523261308670044,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.731,
"step": 77000
},
{
"epoch": 10.556621880998081,
"eval_accuracy": 0.8489030226241834,
"eval_loss": 0.8337299823760986,
"eval_runtime": 256.9915,
"eval_samples_per_second": 157.99,
"eval_steps_per_second": 4.938,
"step": 77000
},
{
"epoch": 10.625171373731835,
"grad_norm": 2.606250286102295,
"learning_rate": 2.25e-05,
"loss": 0.7292,
"step": 77500
},
{
"epoch": 10.625171373731835,
"eval_accuracy": 0.8478736538649552,
"eval_loss": 0.8499141335487366,
"eval_runtime": 258.2456,
"eval_samples_per_second": 157.222,
"eval_steps_per_second": 4.914,
"step": 77500
},
{
"epoch": 10.693720866465588,
"grad_norm": 2.5361220836639404,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.7359,
"step": 78000
},
{
"epoch": 10.693720866465588,
"eval_accuracy": 0.8490765860082904,
"eval_loss": 0.8316646218299866,
"eval_runtime": 258.3157,
"eval_samples_per_second": 157.18,
"eval_steps_per_second": 4.913,
"step": 78000
},
{
"epoch": 10.762270359199341,
"grad_norm": 2.3277316093444824,
"learning_rate": 2.15e-05,
"loss": 0.7284,
"step": 78500
},
{
"epoch": 10.762270359199341,
"eval_accuracy": 0.8495720225312002,
"eval_loss": 0.8365707397460938,
"eval_runtime": 258.2814,
"eval_samples_per_second": 157.201,
"eval_steps_per_second": 4.913,
"step": 78500
},
{
"epoch": 10.830819851933096,
"grad_norm": 2.746189832687378,
"learning_rate": 2.1e-05,
"loss": 0.7316,
"step": 79000
},
{
"epoch": 10.830819851933096,
"eval_accuracy": 0.8500147906280335,
"eval_loss": 0.8251886963844299,
"eval_runtime": 257.4381,
"eval_samples_per_second": 157.716,
"eval_steps_per_second": 4.929,
"step": 79000
},
{
"epoch": 10.89936934466685,
"grad_norm": 2.9917354583740234,
"learning_rate": 2.05e-05,
"loss": 0.7304,
"step": 79500
},
{
"epoch": 10.89936934466685,
"eval_accuracy": 0.8502983624757147,
"eval_loss": 0.8259178996086121,
"eval_runtime": 257.2855,
"eval_samples_per_second": 157.809,
"eval_steps_per_second": 4.932,
"step": 79500
},
{
"epoch": 10.967918837400603,
"grad_norm": 2.275324583053589,
"learning_rate": 2e-05,
"loss": 0.7255,
"step": 80000
},
{
"epoch": 10.967918837400603,
"eval_accuracy": 0.8505023638996737,
"eval_loss": 0.8250493407249451,
"eval_runtime": 257.2933,
"eval_samples_per_second": 157.804,
"eval_steps_per_second": 4.932,
"step": 80000
},
{
"epoch": 11.036468330134356,
"grad_norm": 2.6113440990448,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.7224,
"step": 80500
},
{
"epoch": 11.036468330134356,
"eval_accuracy": 0.8507015443771133,
"eval_loss": 0.8299734592437744,
"eval_runtime": 257.4676,
"eval_samples_per_second": 157.698,
"eval_steps_per_second": 4.929,
"step": 80500
},
{
"epoch": 11.105017822868112,
"grad_norm": 2.6536693572998047,
"learning_rate": 1.9e-05,
"loss": 0.7208,
"step": 81000
},
{
"epoch": 11.105017822868112,
"eval_accuracy": 0.8506463117116102,
"eval_loss": 0.8155694007873535,
"eval_runtime": 256.8592,
"eval_samples_per_second": 158.071,
"eval_steps_per_second": 4.94,
"step": 81000
},
{
"epoch": 11.173567315601865,
"grad_norm": 2.290782928466797,
"learning_rate": 1.85e-05,
"loss": 0.7148,
"step": 81500
},
{
"epoch": 11.173567315601865,
"eval_accuracy": 0.8507982915676063,
"eval_loss": 0.8275089859962463,
"eval_runtime": 257.1589,
"eval_samples_per_second": 157.887,
"eval_steps_per_second": 4.935,
"step": 81500
},
{
"epoch": 11.242116808335618,
"grad_norm": 2.6533780097961426,
"learning_rate": 1.8e-05,
"loss": 0.7193,
"step": 82000
},
{
"epoch": 11.242116808335618,
"eval_accuracy": 0.8510864940426184,
"eval_loss": 0.8217721581459045,
"eval_runtime": 258.1645,
"eval_samples_per_second": 157.272,
"eval_steps_per_second": 4.915,
"step": 82000
},
{
"epoch": 11.310666301069372,
"grad_norm": 2.6084372997283936,
"learning_rate": 1.75e-05,
"loss": 0.7177,
"step": 82500
},
{
"epoch": 11.310666301069372,
"eval_accuracy": 0.8508369557490207,
"eval_loss": 0.8289022445678711,
"eval_runtime": 258.043,
"eval_samples_per_second": 157.346,
"eval_steps_per_second": 4.918,
"step": 82500
},
{
"epoch": 11.379215793803127,
"grad_norm": 2.2717843055725098,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.7211,
"step": 83000
},
{
"epoch": 11.379215793803127,
"eval_accuracy": 0.8514459749572929,
"eval_loss": 0.8198857307434082,
"eval_runtime": 258.3084,
"eval_samples_per_second": 157.184,
"eval_steps_per_second": 4.913,
"step": 83000
},
{
"epoch": 11.44776528653688,
"grad_norm": 2.34387469291687,
"learning_rate": 1.65e-05,
"loss": 0.7093,
"step": 83500
},
{
"epoch": 11.44776528653688,
"eval_accuracy": 0.8511664827348943,
"eval_loss": 0.8272643089294434,
"eval_runtime": 258.1866,
"eval_samples_per_second": 157.258,
"eval_steps_per_second": 4.915,
"step": 83500
},
{
"epoch": 11.516314779270633,
"grad_norm": 2.3854498863220215,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.7154,
"step": 84000
},
{
"epoch": 11.516314779270633,
"eval_accuracy": 0.8518011662252298,
"eval_loss": 0.8211445212364197,
"eval_runtime": 257.2815,
"eval_samples_per_second": 157.812,
"eval_steps_per_second": 4.932,
"step": 84000
},
{
"epoch": 11.584864272004387,
"grad_norm": 2.4457602500915527,
"learning_rate": 1.55e-05,
"loss": 0.7178,
"step": 84500
},
{
"epoch": 11.584864272004387,
"eval_accuracy": 0.8521036808235916,
"eval_loss": 0.8183203339576721,
"eval_runtime": 258.3778,
"eval_samples_per_second": 157.142,
"eval_steps_per_second": 4.911,
"step": 84500
},
{
"epoch": 11.653413764738142,
"grad_norm": 2.5457184314727783,
"learning_rate": 1.5e-05,
"loss": 0.716,
"step": 85000
},
{
"epoch": 11.653413764738142,
"eval_accuracy": 0.8522289156626506,
"eval_loss": 0.8176619410514832,
"eval_runtime": 256.9844,
"eval_samples_per_second": 157.994,
"eval_steps_per_second": 4.938,
"step": 85000
},
{
"epoch": 11.721963257471895,
"grad_norm": 2.855541467666626,
"learning_rate": 1.45e-05,
"loss": 0.7081,
"step": 85500
},
{
"epoch": 11.721963257471895,
"eval_accuracy": 0.8522235192962014,
"eval_loss": 0.8102879524230957,
"eval_runtime": 257.0844,
"eval_samples_per_second": 157.933,
"eval_steps_per_second": 4.936,
"step": 85500
},
{
"epoch": 11.790512750205648,
"grad_norm": 2.44242787361145,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.7112,
"step": 86000
},
{
"epoch": 11.790512750205648,
"eval_accuracy": 0.8531087746062566,
"eval_loss": 0.8167855739593506,
"eval_runtime": 258.2921,
"eval_samples_per_second": 157.194,
"eval_steps_per_second": 4.913,
"step": 86000
},
{
"epoch": 11.859062242939402,
"grad_norm": 2.559410333633423,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.7089,
"step": 86500
},
{
"epoch": 11.859062242939402,
"eval_accuracy": 0.8524401167338927,
"eval_loss": 0.8145312666893005,
"eval_runtime": 258.2263,
"eval_samples_per_second": 157.234,
"eval_steps_per_second": 4.914,
"step": 86500
},
{
"epoch": 11.927611735673157,
"grad_norm": 2.3273496627807617,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7033,
"step": 87000
},
{
"epoch": 11.927611735673157,
"eval_accuracy": 0.8526681918388697,
"eval_loss": 0.8153809309005737,
"eval_runtime": 257.196,
"eval_samples_per_second": 157.864,
"eval_steps_per_second": 4.934,
"step": 87000
},
{
"epoch": 11.99616122840691,
"grad_norm": 2.354360580444336,
"learning_rate": 1.25e-05,
"loss": 0.7029,
"step": 87500
},
{
"epoch": 11.99616122840691,
"eval_accuracy": 0.8531567384453895,
"eval_loss": 0.8229334354400635,
"eval_runtime": 257.3002,
"eval_samples_per_second": 157.8,
"eval_steps_per_second": 4.932,
"step": 87500
},
{
"epoch": 12.064710721140663,
"grad_norm": 2.4728591442108154,
"learning_rate": 1.2e-05,
"loss": 0.7058,
"step": 88000
},
{
"epoch": 12.064710721140663,
"eval_accuracy": 0.8537915504584094,
"eval_loss": 0.8140564560890198,
"eval_runtime": 258.2835,
"eval_samples_per_second": 157.199,
"eval_steps_per_second": 4.913,
"step": 88000
},
{
"epoch": 12.133260213874417,
"grad_norm": 2.485384702682495,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.7005,
"step": 88500
},
{
"epoch": 12.133260213874417,
"eval_accuracy": 0.8534658761375271,
"eval_loss": 0.8151687383651733,
"eval_runtime": 258.2434,
"eval_samples_per_second": 157.224,
"eval_steps_per_second": 4.914,
"step": 88500
},
{
"epoch": 12.201809706608172,
"grad_norm": 2.530062198638916,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.6992,
"step": 89000
},
{
"epoch": 12.201809706608172,
"eval_accuracy": 0.853826029943314,
"eval_loss": 0.8016021847724915,
"eval_runtime": 258.2461,
"eval_samples_per_second": 157.222,
"eval_steps_per_second": 4.914,
"step": 89000
},
{
"epoch": 12.270359199341925,
"grad_norm": 2.5869436264038086,
"learning_rate": 1.05e-05,
"loss": 0.7008,
"step": 89500
},
{
"epoch": 12.270359199341925,
"eval_accuracy": 0.8535235114498525,
"eval_loss": 0.8112274408340454,
"eval_runtime": 257.5365,
"eval_samples_per_second": 157.655,
"eval_steps_per_second": 4.927,
"step": 89500
},
{
"epoch": 12.338908692075679,
"grad_norm": 2.8641934394836426,
"learning_rate": 1e-05,
"loss": 0.6979,
"step": 90000
},
{
"epoch": 12.338908692075679,
"eval_accuracy": 0.8538098856943305,
"eval_loss": 0.8109295964241028,
"eval_runtime": 258.3955,
"eval_samples_per_second": 157.131,
"eval_steps_per_second": 4.911,
"step": 90000
},
{
"epoch": 12.407458184809432,
"grad_norm": 2.686566114425659,
"learning_rate": 9.5e-06,
"loss": 0.6949,
"step": 90500
},
{
"epoch": 12.407458184809432,
"eval_accuracy": 0.8543918997918827,
"eval_loss": 0.8125308156013489,
"eval_runtime": 258.1617,
"eval_samples_per_second": 157.274,
"eval_steps_per_second": 4.916,
"step": 90500
},
{
"epoch": 12.476007677543187,
"grad_norm": 2.452526569366455,
"learning_rate": 9e-06,
"loss": 0.6946,
"step": 91000
},
{
"epoch": 12.476007677543187,
"eval_accuracy": 0.8538440239723134,
"eval_loss": 0.8097832798957825,
"eval_runtime": 258.2364,
"eval_samples_per_second": 157.228,
"eval_steps_per_second": 4.914,
"step": 91000
},
{
"epoch": 12.54455717027694,
"grad_norm": 2.463740825653076,
"learning_rate": 8.500000000000002e-06,
"loss": 0.6939,
"step": 91500
},
{
"epoch": 12.54455717027694,
"eval_accuracy": 0.854438946024891,
"eval_loss": 0.7999902963638306,
"eval_runtime": 257.2634,
"eval_samples_per_second": 157.823,
"eval_steps_per_second": 4.933,
"step": 91500
},
{
"epoch": 12.613106663010694,
"grad_norm": 2.547820568084717,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6969,
"step": 92000
},
{
"epoch": 12.613106663010694,
"eval_accuracy": 0.8542858590534823,
"eval_loss": 0.8070544004440308,
"eval_runtime": 257.3491,
"eval_samples_per_second": 157.77,
"eval_steps_per_second": 4.931,
"step": 92000
},
{
"epoch": 12.681656155744447,
"grad_norm": 2.2731072902679443,
"learning_rate": 7.5e-06,
"loss": 0.6967,
"step": 92500
},
{
"epoch": 12.681656155744447,
"eval_accuracy": 0.8545510749739543,
"eval_loss": 0.7984638810157776,
"eval_runtime": 258.6739,
"eval_samples_per_second": 156.962,
"eval_steps_per_second": 4.906,
"step": 92500
},
{
"epoch": 12.750205648478202,
"grad_norm": 2.242337226867676,
"learning_rate": 7.000000000000001e-06,
"loss": 0.6944,
"step": 93000
},
{
"epoch": 12.750205648478202,
"eval_accuracy": 0.85512864037364,
"eval_loss": 0.7989787459373474,
"eval_runtime": 258.5499,
"eval_samples_per_second": 157.037,
"eval_steps_per_second": 4.908,
"step": 93000
},
{
"epoch": 12.818755141211955,
"grad_norm": 2.4914486408233643,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.6885,
"step": 93500
},
{
"epoch": 12.818755141211955,
"eval_accuracy": 0.8550956587798725,
"eval_loss": 0.8170965313911438,
"eval_runtime": 258.3692,
"eval_samples_per_second": 157.147,
"eval_steps_per_second": 4.912,
"step": 93500
},
{
"epoch": 12.887304633945709,
"grad_norm": 2.1909425258636475,
"learning_rate": 6e-06,
"loss": 0.6897,
"step": 94000
},
{
"epoch": 12.887304633945709,
"eval_accuracy": 0.8550768677242255,
"eval_loss": 0.8014948964118958,
"eval_runtime": 258.4993,
"eval_samples_per_second": 157.068,
"eval_steps_per_second": 4.909,
"step": 94000
},
{
"epoch": 12.955854126679462,
"grad_norm": 2.7882330417633057,
"learning_rate": 5.500000000000001e-06,
"loss": 0.7027,
"step": 94500
},
{
"epoch": 12.955854126679462,
"eval_accuracy": 0.8551748096354626,
"eval_loss": 0.8074929118156433,
"eval_runtime": 258.4421,
"eval_samples_per_second": 157.103,
"eval_steps_per_second": 4.91,
"step": 94500
},
{
"epoch": 13.024403619413217,
"grad_norm": 2.189662218093872,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 95000
},
{
"epoch": 13.024403619413217,
"eval_accuracy": 0.8554257092266915,
"eval_loss": 0.8118977546691895,
"eval_runtime": 257.8701,
"eval_samples_per_second": 157.451,
"eval_steps_per_second": 4.921,
"step": 95000
},
{
"epoch": 13.09295311214697,
"grad_norm": 2.4796793460845947,
"learning_rate": 4.5e-06,
"loss": 0.697,
"step": 95500
},
{
"epoch": 13.09295311214697,
"eval_accuracy": 0.8558198770391428,
"eval_loss": 0.7951220870018005,
"eval_runtime": 258.6851,
"eval_samples_per_second": 156.955,
"eval_steps_per_second": 4.906,
"step": 95500
},
{
"epoch": 13.161502604880724,
"grad_norm": 2.475494146347046,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6814,
"step": 96000
},
{
"epoch": 13.161502604880724,
"eval_accuracy": 0.8557974835152675,
"eval_loss": 0.7992942929267883,
"eval_runtime": 258.6884,
"eval_samples_per_second": 156.953,
"eval_steps_per_second": 4.906,
"step": 96000
},
{
"epoch": 13.230052097614477,
"grad_norm": 2.662364959716797,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.687,
"step": 96500
},
{
"epoch": 13.230052097614477,
"eval_accuracy": 0.8556396210250248,
"eval_loss": 0.7970269322395325,
"eval_runtime": 257.7318,
"eval_samples_per_second": 157.536,
"eval_steps_per_second": 4.924,
"step": 96500
},
{
"epoch": 13.298601590348232,
"grad_norm": 2.5556256771087646,
"learning_rate": 3e-06,
"loss": 0.6956,
"step": 97000
},
{
"epoch": 13.298601590348232,
"eval_accuracy": 0.8560198209787908,
"eval_loss": 0.7952587008476257,
"eval_runtime": 258.556,
"eval_samples_per_second": 157.034,
"eval_steps_per_second": 4.908,
"step": 97000
},
{
"epoch": 13.367151083081986,
"grad_norm": 2.327164888381958,
"learning_rate": 2.5e-06,
"loss": 0.6821,
"step": 97500
},
{
"epoch": 13.367151083081986,
"eval_accuracy": 0.855291832818916,
"eval_loss": 0.8007811307907104,
"eval_runtime": 257.5846,
"eval_samples_per_second": 157.626,
"eval_steps_per_second": 4.927,
"step": 97500
},
{
"epoch": 13.435700575815739,
"grad_norm": 2.408548593521118,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6846,
"step": 98000
},
{
"epoch": 13.435700575815739,
"eval_accuracy": 0.8561709926629,
"eval_loss": 0.7897204756736755,
"eval_runtime": 258.6709,
"eval_samples_per_second": 156.964,
"eval_steps_per_second": 4.906,
"step": 98000
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 14,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.262679236758733e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}