{ "best_metric": 0.7897204756736755, "best_model_checkpoint": "./model_fine-tune/glot/mbert/ron-Latn/checkpoint-98000", "epoch": 13.435700575815739, "eval_steps": 500, "global_step": 98000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06854949273375377, "grad_norm": 4.000861644744873, "learning_rate": 9.95e-05, "loss": 1.7568, "step": 500 }, { "epoch": 0.06854949273375377, "eval_accuracy": 0.7095725691837217, "eval_loss": 1.6891406774520874, "eval_runtime": 259.6177, "eval_samples_per_second": 156.391, "eval_steps_per_second": 4.888, "step": 500 }, { "epoch": 0.13709898546750754, "grad_norm": 4.238007545471191, "learning_rate": 9.900000000000001e-05, "loss": 1.5655, "step": 1000 }, { "epoch": 0.13709898546750754, "eval_accuracy": 0.7265132414723929, "eval_loss": 1.5632870197296143, "eval_runtime": 260.4774, "eval_samples_per_second": 155.875, "eval_steps_per_second": 4.872, "step": 1000 }, { "epoch": 0.2056484782012613, "grad_norm": 3.5532774925231934, "learning_rate": 9.850000000000001e-05, "loss": 1.4944, "step": 1500 }, { "epoch": 0.2056484782012613, "eval_accuracy": 0.7360105726108715, "eval_loss": 1.4989327192306519, "eval_runtime": 260.0937, "eval_samples_per_second": 156.105, "eval_steps_per_second": 4.879, "step": 1500 }, { "epoch": 0.2741979709350151, "grad_norm": 3.5770881175994873, "learning_rate": 9.8e-05, "loss": 1.4324, "step": 2000 }, { "epoch": 0.2741979709350151, "eval_accuracy": 0.743094448707599, "eval_loss": 1.4517085552215576, "eval_runtime": 260.1265, "eval_samples_per_second": 156.086, "eval_steps_per_second": 4.878, "step": 2000 }, { "epoch": 0.34274746366876885, "grad_norm": 3.5476059913635254, "learning_rate": 9.75e-05, "loss": 1.3797, "step": 2500 }, { "epoch": 0.34274746366876885, "eval_accuracy": 0.7503641171887161, "eval_loss": 1.4144247770309448, "eval_runtime": 258.9987, "eval_samples_per_second": 156.765, "eval_steps_per_second": 4.9, "step": 2500 }, { "epoch": 0.4112969564025226, "grad_norm": 3.64892578125, "learning_rate": 9.7e-05, "loss": 1.3435, "step": 3000 }, { "epoch": 0.4112969564025226, "eval_accuracy": 0.7549876281769841, "eval_loss": 1.380112648010254, "eval_runtime": 258.9885, "eval_samples_per_second": 156.771, "eval_steps_per_second": 4.9, "step": 3000 }, { "epoch": 0.4798464491362764, "grad_norm": 3.1375253200531006, "learning_rate": 9.65e-05, "loss": 1.3133, "step": 3500 }, { "epoch": 0.4798464491362764, "eval_accuracy": 0.7577493236618825, "eval_loss": 1.374040126800537, "eval_runtime": 259.0671, "eval_samples_per_second": 156.724, "eval_steps_per_second": 4.898, "step": 3500 }, { "epoch": 0.5483959418700302, "grad_norm": 3.1487910747528076, "learning_rate": 9.6e-05, "loss": 1.2891, "step": 4000 }, { "epoch": 0.5483959418700302, "eval_accuracy": 0.76247168256945, "eval_loss": 1.34718918800354, "eval_runtime": 259.0025, "eval_samples_per_second": 156.763, "eval_steps_per_second": 4.9, "step": 4000 }, { "epoch": 0.6169454346037839, "grad_norm": 2.947143316268921, "learning_rate": 9.55e-05, "loss": 1.2764, "step": 4500 }, { "epoch": 0.6169454346037839, "eval_accuracy": 0.7652144022180228, "eval_loss": 1.3174166679382324, "eval_runtime": 258.6684, "eval_samples_per_second": 156.965, "eval_steps_per_second": 4.906, "step": 4500 }, { "epoch": 0.6854949273375377, "grad_norm": 3.274010181427002, "learning_rate": 9.5e-05, "loss": 1.2492, "step": 5000 }, { "epoch": 0.6854949273375377, "eval_accuracy": 0.7671819928468485, "eval_loss": 1.313983678817749, "eval_runtime": 258.6625, "eval_samples_per_second": 156.969, "eval_steps_per_second": 4.906, "step": 5000 }, { "epoch": 0.7540444200712915, "grad_norm": 3.109161138534546, "learning_rate": 9.449999999999999e-05, "loss": 1.2329, "step": 5500 }, { "epoch": 0.7540444200712915, "eval_accuracy": 0.7713562268262496, "eval_loss": 1.2868432998657227, "eval_runtime": 260.0165, "eval_samples_per_second": 156.152, "eval_steps_per_second": 4.88, "step": 5500 }, { "epoch": 0.8225939128050452, "grad_norm": 7.044505596160889, "learning_rate": 9.4e-05, "loss": 1.2232, "step": 6000 }, { "epoch": 0.8225939128050452, "eval_accuracy": 0.7731761649539602, "eval_loss": 1.2668291330337524, "eval_runtime": 258.9434, "eval_samples_per_second": 156.799, "eval_steps_per_second": 4.901, "step": 6000 }, { "epoch": 0.891143405538799, "grad_norm": 3.029754400253296, "learning_rate": 9.350000000000001e-05, "loss": 1.2117, "step": 6500 }, { "epoch": 0.891143405538799, "eval_accuracy": 0.7750832812825318, "eval_loss": 1.2626760005950928, "eval_runtime": 260.4421, "eval_samples_per_second": 155.896, "eval_steps_per_second": 4.872, "step": 6500 }, { "epoch": 0.9596928982725528, "grad_norm": 2.961531639099121, "learning_rate": 9.300000000000001e-05, "loss": 1.1924, "step": 7000 }, { "epoch": 0.9596928982725528, "eval_accuracy": 0.7771516510947648, "eval_loss": 1.2538079023361206, "eval_runtime": 260.0863, "eval_samples_per_second": 156.11, "eval_steps_per_second": 4.879, "step": 7000 }, { "epoch": 1.0282423910063065, "grad_norm": 3.120314121246338, "learning_rate": 9.250000000000001e-05, "loss": 1.1807, "step": 7500 }, { "epoch": 1.0282423910063065, "eval_accuracy": 0.7786684520598107, "eval_loss": 1.247827410697937, "eval_runtime": 258.2089, "eval_samples_per_second": 157.245, "eval_steps_per_second": 4.915, "step": 7500 }, { "epoch": 1.0967918837400603, "grad_norm": 3.0152571201324463, "learning_rate": 9.200000000000001e-05, "loss": 1.1666, "step": 8000 }, { "epoch": 1.0967918837400603, "eval_accuracy": 0.7806228052923374, "eval_loss": 1.2236727476119995, "eval_runtime": 258.1981, "eval_samples_per_second": 157.251, "eval_steps_per_second": 4.915, "step": 8000 }, { "epoch": 1.165341376473814, "grad_norm": 4.560582637786865, "learning_rate": 9.15e-05, "loss": 1.1582, "step": 8500 }, { "epoch": 1.165341376473814, "eval_accuracy": 0.7811331658792164, "eval_loss": 1.2169686555862427, "eval_runtime": 257.413, "eval_samples_per_second": 157.731, "eval_steps_per_second": 4.93, "step": 8500 }, { "epoch": 1.2338908692075679, "grad_norm": 2.940659523010254, "learning_rate": 9.1e-05, "loss": 1.1376, "step": 9000 }, { "epoch": 1.2338908692075679, "eval_accuracy": 0.7844609998371601, "eval_loss": 1.1992230415344238, "eval_runtime": 257.3928, "eval_samples_per_second": 157.743, "eval_steps_per_second": 4.93, "step": 9000 }, { "epoch": 1.3024403619413216, "grad_norm": 3.4134812355041504, "learning_rate": 9.05e-05, "loss": 1.1358, "step": 9500 }, { "epoch": 1.3024403619413216, "eval_accuracy": 0.7851634886822189, "eval_loss": 1.2106844186782837, "eval_runtime": 257.3145, "eval_samples_per_second": 157.791, "eval_steps_per_second": 4.932, "step": 9500 }, { "epoch": 1.3709898546750754, "grad_norm": 2.8850438594818115, "learning_rate": 9e-05, "loss": 1.1281, "step": 10000 }, { "epoch": 1.3709898546750754, "eval_accuracy": 0.7872160784699163, "eval_loss": 1.2045215368270874, "eval_runtime": 257.4649, "eval_samples_per_second": 157.699, "eval_steps_per_second": 4.929, "step": 10000 }, { "epoch": 1.4395393474088292, "grad_norm": 2.752389669418335, "learning_rate": 8.950000000000001e-05, "loss": 1.119, "step": 10500 }, { "epoch": 1.4395393474088292, "eval_accuracy": 0.7882877785798362, "eval_loss": 1.1866884231567383, "eval_runtime": 257.0898, "eval_samples_per_second": 157.929, "eval_steps_per_second": 4.936, "step": 10500 }, { "epoch": 1.508088840142583, "grad_norm": 3.4821131229400635, "learning_rate": 8.900000000000001e-05, "loss": 1.1052, "step": 11000 }, { "epoch": 1.508088840142583, "eval_accuracy": 0.7891213465142315, "eval_loss": 1.177182912826538, "eval_runtime": 257.3303, "eval_samples_per_second": 157.782, "eval_steps_per_second": 4.931, "step": 11000 }, { "epoch": 1.5766383328763367, "grad_norm": 2.641080379486084, "learning_rate": 8.850000000000001e-05, "loss": 1.0969, "step": 11500 }, { "epoch": 1.5766383328763367, "eval_accuracy": 0.790978984380183, "eval_loss": 1.1677839756011963, "eval_runtime": 257.8775, "eval_samples_per_second": 157.447, "eval_steps_per_second": 4.921, "step": 11500 }, { "epoch": 1.6451878256100905, "grad_norm": 2.670703172683716, "learning_rate": 8.800000000000001e-05, "loss": 1.0973, "step": 12000 }, { "epoch": 1.6451878256100905, "eval_accuracy": 0.7914186279008998, "eval_loss": 1.1657401323318481, "eval_runtime": 257.6044, "eval_samples_per_second": 157.614, "eval_steps_per_second": 4.926, "step": 12000 }, { "epoch": 1.7137373183438442, "grad_norm": 4.184514045715332, "learning_rate": 8.75e-05, "loss": 1.0931, "step": 12500 }, { "epoch": 1.7137373183438442, "eval_accuracy": 0.7924260144852202, "eval_loss": 1.1656056642532349, "eval_runtime": 257.7225, "eval_samples_per_second": 157.542, "eval_steps_per_second": 4.924, "step": 12500 }, { "epoch": 1.782286811077598, "grad_norm": 3.4420437812805176, "learning_rate": 8.7e-05, "loss": 1.0882, "step": 13000 }, { "epoch": 1.782286811077598, "eval_accuracy": 0.7937018737958299, "eval_loss": 1.159055471420288, "eval_runtime": 257.6253, "eval_samples_per_second": 157.601, "eval_steps_per_second": 4.926, "step": 13000 }, { "epoch": 1.8508363038113518, "grad_norm": 2.942854642868042, "learning_rate": 8.65e-05, "loss": 1.0802, "step": 13500 }, { "epoch": 1.8508363038113518, "eval_accuracy": 0.7948528675102821, "eval_loss": 1.1472080945968628, "eval_runtime": 257.8341, "eval_samples_per_second": 157.473, "eval_steps_per_second": 4.922, "step": 13500 }, { "epoch": 1.9193857965451055, "grad_norm": 2.382511854171753, "learning_rate": 8.6e-05, "loss": 1.0766, "step": 14000 }, { "epoch": 1.9193857965451055, "eval_accuracy": 0.7966625616136973, "eval_loss": 1.1392544507980347, "eval_runtime": 258.8105, "eval_samples_per_second": 156.879, "eval_steps_per_second": 4.903, "step": 14000 }, { "epoch": 1.9879352892788593, "grad_norm": 2.583773374557495, "learning_rate": 8.55e-05, "loss": 1.0719, "step": 14500 }, { "epoch": 1.9879352892788593, "eval_accuracy": 0.7972140563559621, "eval_loss": 1.1423135995864868, "eval_runtime": 257.6602, "eval_samples_per_second": 157.58, "eval_steps_per_second": 4.925, "step": 14500 }, { "epoch": 2.056484782012613, "grad_norm": 2.788512945175171, "learning_rate": 8.5e-05, "loss": 1.0553, "step": 15000 }, { "epoch": 2.056484782012613, "eval_accuracy": 0.7980805136441053, "eval_loss": 1.128947377204895, "eval_runtime": 257.7272, "eval_samples_per_second": 157.539, "eval_steps_per_second": 4.924, "step": 15000 }, { "epoch": 2.125034274746367, "grad_norm": 2.9311256408691406, "learning_rate": 8.450000000000001e-05, "loss": 1.0451, "step": 15500 }, { "epoch": 2.125034274746367, "eval_accuracy": 0.7988239883829299, "eval_loss": 1.1336219310760498, "eval_runtime": 258.1352, "eval_samples_per_second": 157.29, "eval_steps_per_second": 4.916, "step": 15500 }, { "epoch": 2.1935837674801206, "grad_norm": 3.1457791328430176, "learning_rate": 8.4e-05, "loss": 1.0423, "step": 16000 }, { "epoch": 2.1935837674801206, "eval_accuracy": 0.7996375561822584, "eval_loss": 1.1223351955413818, "eval_runtime": 258.998, "eval_samples_per_second": 156.766, "eval_steps_per_second": 4.9, "step": 16000 }, { "epoch": 2.2621332602138744, "grad_norm": 2.9234752655029297, "learning_rate": 8.35e-05, "loss": 1.0449, "step": 16500 }, { "epoch": 2.2621332602138744, "eval_accuracy": 0.8002099641553145, "eval_loss": 1.1050431728363037, "eval_runtime": 258.0627, "eval_samples_per_second": 157.334, "eval_steps_per_second": 4.917, "step": 16500 }, { "epoch": 2.330682752947628, "grad_norm": 2.9982569217681885, "learning_rate": 8.3e-05, "loss": 1.0312, "step": 17000 }, { "epoch": 2.330682752947628, "eval_accuracy": 0.8017049854116234, "eval_loss": 1.1168311834335327, "eval_runtime": 258.8729, "eval_samples_per_second": 156.841, "eval_steps_per_second": 4.902, "step": 17000 }, { "epoch": 2.399232245681382, "grad_norm": 2.63649845123291, "learning_rate": 8.25e-05, "loss": 1.0291, "step": 17500 }, { "epoch": 2.399232245681382, "eval_accuracy": 0.8025579020578063, "eval_loss": 1.1074473857879639, "eval_runtime": 257.7749, "eval_samples_per_second": 157.51, "eval_steps_per_second": 4.923, "step": 17500 }, { "epoch": 2.4677817384151357, "grad_norm": 3.122042417526245, "learning_rate": 8.2e-05, "loss": 1.0245, "step": 18000 }, { "epoch": 2.4677817384151357, "eval_accuracy": 0.802570536529715, "eval_loss": 1.1076058149337769, "eval_runtime": 258.567, "eval_samples_per_second": 157.027, "eval_steps_per_second": 4.908, "step": 18000 }, { "epoch": 2.5363312311488895, "grad_norm": 3.2931995391845703, "learning_rate": 8.15e-05, "loss": 1.0236, "step": 18500 }, { "epoch": 2.5363312311488895, "eval_accuracy": 0.803701212821904, "eval_loss": 1.092323660850525, "eval_runtime": 257.5769, "eval_samples_per_second": 157.631, "eval_steps_per_second": 4.927, "step": 18500 }, { "epoch": 2.6048807238826432, "grad_norm": 3.31691837310791, "learning_rate": 8.1e-05, "loss": 1.0218, "step": 19000 }, { "epoch": 2.6048807238826432, "eval_accuracy": 0.8040252089042342, "eval_loss": 1.0852642059326172, "eval_runtime": 258.7184, "eval_samples_per_second": 156.935, "eval_steps_per_second": 4.905, "step": 19000 }, { "epoch": 2.673430216616397, "grad_norm": 2.602132558822632, "learning_rate": 8.05e-05, "loss": 1.0101, "step": 19500 }, { "epoch": 2.673430216616397, "eval_accuracy": 0.8049598014763766, "eval_loss": 1.0851576328277588, "eval_runtime": 259.1257, "eval_samples_per_second": 156.688, "eval_steps_per_second": 4.897, "step": 19500 }, { "epoch": 2.741979709350151, "grad_norm": 2.6089420318603516, "learning_rate": 8e-05, "loss": 1.0154, "step": 20000 }, { "epoch": 2.741979709350151, "eval_accuracy": 0.8060563142838197, "eval_loss": 1.0850160121917725, "eval_runtime": 257.7528, "eval_samples_per_second": 157.523, "eval_steps_per_second": 4.923, "step": 20000 }, { "epoch": 2.8105292020839046, "grad_norm": 2.57804536819458, "learning_rate": 7.950000000000001e-05, "loss": 1.0092, "step": 20500 }, { "epoch": 2.8105292020839046, "eval_accuracy": 0.8063966673859484, "eval_loss": 1.064876675605774, "eval_runtime": 257.3862, "eval_samples_per_second": 157.747, "eval_steps_per_second": 4.93, "step": 20500 }, { "epoch": 2.8790786948176583, "grad_norm": 2.919243097305298, "learning_rate": 7.900000000000001e-05, "loss": 0.9962, "step": 21000 }, { "epoch": 2.8790786948176583, "eval_accuracy": 0.8074074812889718, "eval_loss": 1.0758228302001953, "eval_runtime": 257.4397, "eval_samples_per_second": 157.715, "eval_steps_per_second": 4.929, "step": 21000 }, { "epoch": 2.947628187551412, "grad_norm": 2.7142975330352783, "learning_rate": 7.850000000000001e-05, "loss": 0.9949, "step": 21500 }, { "epoch": 2.947628187551412, "eval_accuracy": 0.8072095077707074, "eval_loss": 1.0723259449005127, "eval_runtime": 257.4852, "eval_samples_per_second": 157.687, "eval_steps_per_second": 4.928, "step": 21500 }, { "epoch": 3.016177680285166, "grad_norm": 2.461714267730713, "learning_rate": 7.800000000000001e-05, "loss": 0.9933, "step": 22000 }, { "epoch": 3.016177680285166, "eval_accuracy": 0.8092224160930719, "eval_loss": 1.0564687252044678, "eval_runtime": 262.5555, "eval_samples_per_second": 154.642, "eval_steps_per_second": 4.833, "step": 22000 }, { "epoch": 3.0847271730189196, "grad_norm": 3.128793716430664, "learning_rate": 7.75e-05, "loss": 0.9751, "step": 22500 }, { "epoch": 3.0847271730189196, "eval_accuracy": 0.8092534032416199, "eval_loss": 1.0642082691192627, "eval_runtime": 263.2784, "eval_samples_per_second": 154.217, "eval_steps_per_second": 4.82, "step": 22500 }, { "epoch": 3.1532766657526734, "grad_norm": 2.560393810272217, "learning_rate": 7.7e-05, "loss": 0.9792, "step": 23000 }, { "epoch": 3.1532766657526734, "eval_accuracy": 0.8098133712563933, "eval_loss": 1.0663542747497559, "eval_runtime": 279.6936, "eval_samples_per_second": 145.166, "eval_steps_per_second": 4.537, "step": 23000 }, { "epoch": 3.221826158486427, "grad_norm": 2.993088483810425, "learning_rate": 7.65e-05, "loss": 0.9671, "step": 23500 }, { "epoch": 3.221826158486427, "eval_accuracy": 0.8106374773041439, "eval_loss": 1.0509235858917236, "eval_runtime": 302.0862, "eval_samples_per_second": 134.405, "eval_steps_per_second": 4.201, "step": 23500 }, { "epoch": 3.290375651220181, "grad_norm": 2.796325445175171, "learning_rate": 7.6e-05, "loss": 0.9667, "step": 24000 }, { "epoch": 3.290375651220181, "eval_accuracy": 0.8117766899143959, "eval_loss": 1.0434768199920654, "eval_runtime": 302.1312, "eval_samples_per_second": 134.385, "eval_steps_per_second": 4.2, "step": 24000 }, { "epoch": 3.3589251439539347, "grad_norm": 3.067168712615967, "learning_rate": 7.55e-05, "loss": 0.9676, "step": 24500 }, { "epoch": 3.3589251439539347, "eval_accuracy": 0.811503557227325, "eval_loss": 1.0434749126434326, "eval_runtime": 303.0455, "eval_samples_per_second": 133.98, "eval_steps_per_second": 4.187, "step": 24500 }, { "epoch": 3.4274746366876885, "grad_norm": 6.064915657043457, "learning_rate": 7.500000000000001e-05, "loss": 0.9659, "step": 25000 }, { "epoch": 3.4274746366876885, "eval_accuracy": 0.8128527567965343, "eval_loss": 1.0416052341461182, "eval_runtime": 301.3987, "eval_samples_per_second": 134.712, "eval_steps_per_second": 4.21, "step": 25000 }, { "epoch": 3.4960241294214423, "grad_norm": 2.412940740585327, "learning_rate": 7.450000000000001e-05, "loss": 0.9534, "step": 25500 }, { "epoch": 3.4960241294214423, "eval_accuracy": 0.8132953171379946, "eval_loss": 1.0385822057724, "eval_runtime": 300.1799, "eval_samples_per_second": 135.259, "eval_steps_per_second": 4.227, "step": 25500 }, { "epoch": 3.564573622155196, "grad_norm": 2.9293601512908936, "learning_rate": 7.4e-05, "loss": 0.9579, "step": 26000 }, { "epoch": 3.564573622155196, "eval_accuracy": 0.8135014968664439, "eval_loss": 1.0341033935546875, "eval_runtime": 302.1905, "eval_samples_per_second": 134.359, "eval_steps_per_second": 4.199, "step": 26000 }, { "epoch": 3.63312311488895, "grad_norm": 2.3857290744781494, "learning_rate": 7.35e-05, "loss": 0.9562, "step": 26500 }, { "epoch": 3.63312311488895, "eval_accuracy": 0.8136511493390661, "eval_loss": 1.0400645732879639, "eval_runtime": 300.9192, "eval_samples_per_second": 134.927, "eval_steps_per_second": 4.217, "step": 26500 }, { "epoch": 3.7016726076227036, "grad_norm": 2.8844683170318604, "learning_rate": 7.3e-05, "loss": 0.9581, "step": 27000 }, { "epoch": 3.7016726076227036, "eval_accuracy": 0.814749558109949, "eval_loss": 1.0379000902175903, "eval_runtime": 300.8792, "eval_samples_per_second": 134.945, "eval_steps_per_second": 4.218, "step": 27000 }, { "epoch": 3.7702221003564573, "grad_norm": 3.2288286685943604, "learning_rate": 7.25e-05, "loss": 0.9524, "step": 27500 }, { "epoch": 3.7702221003564573, "eval_accuracy": 0.8149326212786671, "eval_loss": 1.0268869400024414, "eval_runtime": 303.3311, "eval_samples_per_second": 133.854, "eval_steps_per_second": 4.184, "step": 27500 }, { "epoch": 3.838771593090211, "grad_norm": 2.84405255317688, "learning_rate": 7.2e-05, "loss": 0.9366, "step": 28000 }, { "epoch": 3.838771593090211, "eval_accuracy": 0.8165028910386263, "eval_loss": 1.0258753299713135, "eval_runtime": 301.0226, "eval_samples_per_second": 134.88, "eval_steps_per_second": 4.216, "step": 28000 }, { "epoch": 3.907321085823965, "grad_norm": 2.78871488571167, "learning_rate": 7.15e-05, "loss": 0.9489, "step": 28500 }, { "epoch": 3.907321085823965, "eval_accuracy": 0.8153732124964858, "eval_loss": 1.0232901573181152, "eval_runtime": 303.0362, "eval_samples_per_second": 133.984, "eval_steps_per_second": 4.188, "step": 28500 }, { "epoch": 3.9758705785577186, "grad_norm": 2.6128122806549072, "learning_rate": 7.1e-05, "loss": 0.9372, "step": 29000 }, { "epoch": 3.9758705785577186, "eval_accuracy": 0.8167775740472835, "eval_loss": 1.0158660411834717, "eval_runtime": 305.0716, "eval_samples_per_second": 133.09, "eval_steps_per_second": 4.16, "step": 29000 }, { "epoch": 4.044420071291473, "grad_norm": 2.4649457931518555, "learning_rate": 7.05e-05, "loss": 0.9389, "step": 29500 }, { "epoch": 4.044420071291473, "eval_accuracy": 0.8169951094301297, "eval_loss": 1.0179492235183716, "eval_runtime": 302.9032, "eval_samples_per_second": 134.043, "eval_steps_per_second": 4.189, "step": 29500 }, { "epoch": 4.112969564025226, "grad_norm": 2.637385845184326, "learning_rate": 7e-05, "loss": 0.9224, "step": 30000 }, { "epoch": 4.112969564025226, "eval_accuracy": 0.8178961830392573, "eval_loss": 1.0067319869995117, "eval_runtime": 300.9419, "eval_samples_per_second": 134.916, "eval_steps_per_second": 4.217, "step": 30000 }, { "epoch": 4.18151905675898, "grad_norm": 2.863875389099121, "learning_rate": 6.95e-05, "loss": 0.9205, "step": 30500 }, { "epoch": 4.18151905675898, "eval_accuracy": 0.8184249147223241, "eval_loss": 1.0113714933395386, "eval_runtime": 301.5875, "eval_samples_per_second": 134.628, "eval_steps_per_second": 4.208, "step": 30500 }, { "epoch": 4.250068549492734, "grad_norm": 2.4322681427001953, "learning_rate": 6.9e-05, "loss": 0.9247, "step": 31000 }, { "epoch": 4.250068549492734, "eval_accuracy": 0.8186936629410247, "eval_loss": 1.003678798675537, "eval_runtime": 303.1297, "eval_samples_per_second": 133.943, "eval_steps_per_second": 4.186, "step": 31000 }, { "epoch": 4.318618042226488, "grad_norm": 2.998030424118042, "learning_rate": 6.850000000000001e-05, "loss": 0.9178, "step": 31500 }, { "epoch": 4.318618042226488, "eval_accuracy": 0.8182408058742606, "eval_loss": 1.0021617412567139, "eval_runtime": 302.4668, "eval_samples_per_second": 134.236, "eval_steps_per_second": 4.196, "step": 31500 }, { "epoch": 4.387167534960241, "grad_norm": 2.5294859409332275, "learning_rate": 6.800000000000001e-05, "loss": 0.9176, "step": 32000 }, { "epoch": 4.387167534960241, "eval_accuracy": 0.8199272207420885, "eval_loss": 1.0028120279312134, "eval_runtime": 301.3201, "eval_samples_per_second": 134.747, "eval_steps_per_second": 4.211, "step": 32000 }, { "epoch": 4.4557170276939955, "grad_norm": 4.368305206298828, "learning_rate": 6.750000000000001e-05, "loss": 0.9187, "step": 32500 }, { "epoch": 4.4557170276939955, "eval_accuracy": 0.8202180503701034, "eval_loss": 1.0021815299987793, "eval_runtime": 301.1897, "eval_samples_per_second": 134.805, "eval_steps_per_second": 4.213, "step": 32500 }, { "epoch": 4.524266520427749, "grad_norm": 3.3230433464050293, "learning_rate": 6.7e-05, "loss": 0.9169, "step": 33000 }, { "epoch": 4.524266520427749, "eval_accuracy": 0.8205231724765897, "eval_loss": 0.9979987740516663, "eval_runtime": 300.7001, "eval_samples_per_second": 135.025, "eval_steps_per_second": 4.22, "step": 33000 }, { "epoch": 4.592816013161503, "grad_norm": 2.592043876647949, "learning_rate": 6.65e-05, "loss": 0.9125, "step": 33500 }, { "epoch": 4.592816013161503, "eval_accuracy": 0.8206935126919565, "eval_loss": 0.9938598871231079, "eval_runtime": 301.9997, "eval_samples_per_second": 134.444, "eval_steps_per_second": 4.202, "step": 33500 }, { "epoch": 4.661365505895256, "grad_norm": 2.446427345275879, "learning_rate": 6.6e-05, "loss": 0.9146, "step": 34000 }, { "epoch": 4.661365505895256, "eval_accuracy": 0.8216566473447208, "eval_loss": 0.9849461913108826, "eval_runtime": 302.9739, "eval_samples_per_second": 134.012, "eval_steps_per_second": 4.188, "step": 34000 }, { "epoch": 4.72991499862901, "grad_norm": 2.884946346282959, "learning_rate": 6.55e-05, "loss": 0.9018, "step": 34500 }, { "epoch": 4.72991499862901, "eval_accuracy": 0.8217915724349476, "eval_loss": 1.0003894567489624, "eval_runtime": 300.4012, "eval_samples_per_second": 135.159, "eval_steps_per_second": 4.224, "step": 34500 }, { "epoch": 4.798464491362764, "grad_norm": 2.8886282444000244, "learning_rate": 6.500000000000001e-05, "loss": 0.9014, "step": 35000 }, { "epoch": 4.798464491362764, "eval_accuracy": 0.8224961720866916, "eval_loss": 0.9889456629753113, "eval_runtime": 301.0802, "eval_samples_per_second": 134.854, "eval_steps_per_second": 4.215, "step": 35000 }, { "epoch": 4.867013984096518, "grad_norm": 2.473068952560425, "learning_rate": 6.450000000000001e-05, "loss": 0.8919, "step": 35500 }, { "epoch": 4.867013984096518, "eval_accuracy": 0.822945545786959, "eval_loss": 0.9848706722259521, "eval_runtime": 303.9067, "eval_samples_per_second": 133.6, "eval_steps_per_second": 4.176, "step": 35500 }, { "epoch": 4.935563476830271, "grad_norm": 3.0716209411621094, "learning_rate": 6.400000000000001e-05, "loss": 0.8993, "step": 36000 }, { "epoch": 4.935563476830271, "eval_accuracy": 0.8222583681418539, "eval_loss": 0.9929753541946411, "eval_runtime": 299.5059, "eval_samples_per_second": 135.563, "eval_steps_per_second": 4.237, "step": 36000 }, { "epoch": 5.004112969564026, "grad_norm": 2.3323957920074463, "learning_rate": 6.35e-05, "loss": 0.9009, "step": 36500 }, { "epoch": 5.004112969564026, "eval_accuracy": 0.8241444257225273, "eval_loss": 0.9798668622970581, "eval_runtime": 299.3852, "eval_samples_per_second": 135.618, "eval_steps_per_second": 4.239, "step": 36500 }, { "epoch": 5.072662462297779, "grad_norm": 2.7152209281921387, "learning_rate": 6.3e-05, "loss": 0.8843, "step": 37000 }, { "epoch": 5.072662462297779, "eval_accuracy": 0.8234012575934279, "eval_loss": 0.9811968803405762, "eval_runtime": 300.6715, "eval_samples_per_second": 135.038, "eval_steps_per_second": 4.221, "step": 37000 }, { "epoch": 5.141211955031533, "grad_norm": 2.526486396789551, "learning_rate": 6.25e-05, "loss": 0.8846, "step": 37500 }, { "epoch": 5.141211955031533, "eval_accuracy": 0.8247555724795991, "eval_loss": 0.9730820655822754, "eval_runtime": 299.291, "eval_samples_per_second": 135.661, "eval_steps_per_second": 4.24, "step": 37500 }, { "epoch": 5.2097614477652865, "grad_norm": 2.5805063247680664, "learning_rate": 6.2e-05, "loss": 0.8807, "step": 38000 }, { "epoch": 5.2097614477652865, "eval_accuracy": 0.8250464067024924, "eval_loss": 0.9684708118438721, "eval_runtime": 302.4457, "eval_samples_per_second": 134.246, "eval_steps_per_second": 4.196, "step": 38000 }, { "epoch": 5.278310940499041, "grad_norm": 2.559605360031128, "learning_rate": 6.15e-05, "loss": 0.8802, "step": 38500 }, { "epoch": 5.278310940499041, "eval_accuracy": 0.8254996987535631, "eval_loss": 0.973818838596344, "eval_runtime": 300.146, "eval_samples_per_second": 135.274, "eval_steps_per_second": 4.228, "step": 38500 }, { "epoch": 5.346860433232794, "grad_norm": 2.1615304946899414, "learning_rate": 6.1e-05, "loss": 0.8789, "step": 39000 }, { "epoch": 5.346860433232794, "eval_accuracy": 0.8254549864960571, "eval_loss": 0.9578101634979248, "eval_runtime": 302.5653, "eval_samples_per_second": 134.193, "eval_steps_per_second": 4.194, "step": 39000 }, { "epoch": 5.415409925966548, "grad_norm": 2.2763609886169434, "learning_rate": 6.05e-05, "loss": 0.8843, "step": 39500 }, { "epoch": 5.415409925966548, "eval_accuracy": 0.8262787384248012, "eval_loss": 0.9698151350021362, "eval_runtime": 301.7649, "eval_samples_per_second": 134.548, "eval_steps_per_second": 4.205, "step": 39500 }, { "epoch": 5.483959418700302, "grad_norm": 2.3774330615997314, "learning_rate": 6e-05, "loss": 0.8714, "step": 40000 }, { "epoch": 5.483959418700302, "eval_accuracy": 0.8263406985859876, "eval_loss": 0.9681651592254639, "eval_runtime": 302.4298, "eval_samples_per_second": 134.253, "eval_steps_per_second": 4.196, "step": 40000 }, { "epoch": 5.552508911434055, "grad_norm": 2.3430614471435547, "learning_rate": 5.95e-05, "loss": 0.8676, "step": 40500 }, { "epoch": 5.552508911434055, "eval_accuracy": 0.8263813882499345, "eval_loss": 0.9515417814254761, "eval_runtime": 301.876, "eval_samples_per_second": 134.499, "eval_steps_per_second": 4.204, "step": 40500 }, { "epoch": 5.621058404167809, "grad_norm": 2.3059141635894775, "learning_rate": 5.9e-05, "loss": 0.8721, "step": 41000 }, { "epoch": 5.621058404167809, "eval_accuracy": 0.8275216377261674, "eval_loss": 0.962243914604187, "eval_runtime": 302.2447, "eval_samples_per_second": 134.335, "eval_steps_per_second": 4.199, "step": 41000 }, { "epoch": 5.689607896901563, "grad_norm": 2.462218999862671, "learning_rate": 5.85e-05, "loss": 0.8699, "step": 41500 }, { "epoch": 5.689607896901563, "eval_accuracy": 0.8283203266406056, "eval_loss": 0.9477165341377258, "eval_runtime": 302.7485, "eval_samples_per_second": 134.111, "eval_steps_per_second": 4.192, "step": 41500 }, { "epoch": 5.758157389635317, "grad_norm": 3.0347349643707275, "learning_rate": 5.8e-05, "loss": 0.8634, "step": 42000 }, { "epoch": 5.758157389635317, "eval_accuracy": 0.8281108707620414, "eval_loss": 0.9486715197563171, "eval_runtime": 304.225, "eval_samples_per_second": 133.46, "eval_steps_per_second": 4.171, "step": 42000 }, { "epoch": 5.82670688236907, "grad_norm": 3.0054922103881836, "learning_rate": 5.7499999999999995e-05, "loss": 0.8743, "step": 42500 }, { "epoch": 5.82670688236907, "eval_accuracy": 0.8284311134181968, "eval_loss": 0.9539070725440979, "eval_runtime": 304.8553, "eval_samples_per_second": 133.185, "eval_steps_per_second": 4.163, "step": 42500 }, { "epoch": 5.895256375102824, "grad_norm": 2.29243540763855, "learning_rate": 5.6999999999999996e-05, "loss": 0.8667, "step": 43000 }, { "epoch": 5.895256375102824, "eval_accuracy": 0.8291235685160401, "eval_loss": 0.9470139145851135, "eval_runtime": 298.7685, "eval_samples_per_second": 135.898, "eval_steps_per_second": 4.247, "step": 43000 }, { "epoch": 5.963805867836578, "grad_norm": 2.5743372440338135, "learning_rate": 5.65e-05, "loss": 0.8681, "step": 43500 }, { "epoch": 5.963805867836578, "eval_accuracy": 0.8291158725629887, "eval_loss": 0.946834921836853, "eval_runtime": 284.2736, "eval_samples_per_second": 142.827, "eval_steps_per_second": 4.464, "step": 43500 }, { "epoch": 6.032355360570332, "grad_norm": 2.33494234085083, "learning_rate": 5.6000000000000006e-05, "loss": 0.8594, "step": 44000 }, { "epoch": 6.032355360570332, "eval_accuracy": 0.8301663716691428, "eval_loss": 0.9472524523735046, "eval_runtime": 261.9172, "eval_samples_per_second": 155.018, "eval_steps_per_second": 4.845, "step": 44000 }, { "epoch": 6.100904853304086, "grad_norm": 2.7616426944732666, "learning_rate": 5.550000000000001e-05, "loss": 0.8517, "step": 44500 }, { "epoch": 6.100904853304086, "eval_accuracy": 0.8304027916380742, "eval_loss": 0.9408496022224426, "eval_runtime": 257.341, "eval_samples_per_second": 157.775, "eval_steps_per_second": 4.931, "step": 44500 }, { "epoch": 6.169454346037839, "grad_norm": 2.6394338607788086, "learning_rate": 5.500000000000001e-05, "loss": 0.8453, "step": 45000 }, { "epoch": 6.169454346037839, "eval_accuracy": 0.8302381896975964, "eval_loss": 0.945652425289154, "eval_runtime": 257.4862, "eval_samples_per_second": 157.686, "eval_steps_per_second": 4.928, "step": 45000 }, { "epoch": 6.2380038387715935, "grad_norm": 2.6004316806793213, "learning_rate": 5.45e-05, "loss": 0.8486, "step": 45500 }, { "epoch": 6.2380038387715935, "eval_accuracy": 0.8311095745962099, "eval_loss": 0.940719485282898, "eval_runtime": 258.4002, "eval_samples_per_second": 157.128, "eval_steps_per_second": 4.911, "step": 45500 }, { "epoch": 6.306553331505347, "grad_norm": 2.722169876098633, "learning_rate": 5.4000000000000005e-05, "loss": 0.8469, "step": 46000 }, { "epoch": 6.306553331505347, "eval_accuracy": 0.8303636681664364, "eval_loss": 0.9365447759628296, "eval_runtime": 257.5562, "eval_samples_per_second": 157.643, "eval_steps_per_second": 4.927, "step": 46000 }, { "epoch": 6.375102824239101, "grad_norm": 2.955397367477417, "learning_rate": 5.3500000000000006e-05, "loss": 0.8434, "step": 46500 }, { "epoch": 6.375102824239101, "eval_accuracy": 0.8311330928241353, "eval_loss": 0.9382375478744507, "eval_runtime": 258.6675, "eval_samples_per_second": 156.966, "eval_steps_per_second": 4.906, "step": 46500 }, { "epoch": 6.443652316972854, "grad_norm": 2.375140428543091, "learning_rate": 5.300000000000001e-05, "loss": 0.8343, "step": 47000 }, { "epoch": 6.443652316972854, "eval_accuracy": 0.8315033650181133, "eval_loss": 0.934901773929596, "eval_runtime": 257.5199, "eval_samples_per_second": 157.665, "eval_steps_per_second": 4.928, "step": 47000 }, { "epoch": 6.512201809706609, "grad_norm": 2.4617624282836914, "learning_rate": 5.25e-05, "loss": 0.8312, "step": 47500 }, { "epoch": 6.512201809706609, "eval_accuracy": 0.8317886916287004, "eval_loss": 0.9246230721473694, "eval_runtime": 258.5693, "eval_samples_per_second": 157.026, "eval_steps_per_second": 4.908, "step": 47500 }, { "epoch": 6.580751302440362, "grad_norm": 2.4794909954071045, "learning_rate": 5.2000000000000004e-05, "loss": 0.8365, "step": 48000 }, { "epoch": 6.580751302440362, "eval_accuracy": 0.8332747655954476, "eval_loss": 0.9223575592041016, "eval_runtime": 257.532, "eval_samples_per_second": 157.658, "eval_steps_per_second": 4.928, "step": 48000 }, { "epoch": 6.649300795174116, "grad_norm": 2.893775224685669, "learning_rate": 5.1500000000000005e-05, "loss": 0.8307, "step": 48500 }, { "epoch": 6.649300795174116, "eval_accuracy": 0.8332412930945546, "eval_loss": 0.9224662184715271, "eval_runtime": 257.3988, "eval_samples_per_second": 157.74, "eval_steps_per_second": 4.93, "step": 48500 }, { "epoch": 6.717850287907869, "grad_norm": 2.4228713512420654, "learning_rate": 5.1000000000000006e-05, "loss": 0.838, "step": 49000 }, { "epoch": 6.717850287907869, "eval_accuracy": 0.8337124591782815, "eval_loss": 0.9226129055023193, "eval_runtime": 258.4175, "eval_samples_per_second": 157.118, "eval_steps_per_second": 4.911, "step": 49000 }, { "epoch": 6.786399780641624, "grad_norm": 2.462571144104004, "learning_rate": 5.05e-05, "loss": 0.8355, "step": 49500 }, { "epoch": 6.786399780641624, "eval_accuracy": 0.8337023695286286, "eval_loss": 0.9333141446113586, "eval_runtime": 257.7508, "eval_samples_per_second": 157.524, "eval_steps_per_second": 4.923, "step": 49500 }, { "epoch": 6.854949273375377, "grad_norm": 2.5558559894561768, "learning_rate": 5e-05, "loss": 0.8391, "step": 50000 }, { "epoch": 6.854949273375377, "eval_accuracy": 0.8339626635022332, "eval_loss": 0.9166584610939026, "eval_runtime": 258.4946, "eval_samples_per_second": 157.071, "eval_steps_per_second": 4.909, "step": 50000 }, { "epoch": 6.923498766109131, "grad_norm": 2.733778953552246, "learning_rate": 4.9500000000000004e-05, "loss": 0.834, "step": 50500 }, { "epoch": 6.923498766109131, "eval_accuracy": 0.8344111438621534, "eval_loss": 0.9235773086547852, "eval_runtime": 258.6401, "eval_samples_per_second": 156.983, "eval_steps_per_second": 4.906, "step": 50500 }, { "epoch": 6.9920482588428845, "grad_norm": 2.802053451538086, "learning_rate": 4.9e-05, "loss": 0.8269, "step": 51000 }, { "epoch": 6.9920482588428845, "eval_accuracy": 0.8348735131396657, "eval_loss": 0.920405924320221, "eval_runtime": 257.5552, "eval_samples_per_second": 157.644, "eval_steps_per_second": 4.927, "step": 51000 }, { "epoch": 7.060597751576639, "grad_norm": 2.666555404663086, "learning_rate": 4.85e-05, "loss": 0.8085, "step": 51500 }, { "epoch": 7.060597751576639, "eval_accuracy": 0.8347147667297783, "eval_loss": 0.9154396057128906, "eval_runtime": 258.4259, "eval_samples_per_second": 157.113, "eval_steps_per_second": 4.91, "step": 51500 }, { "epoch": 7.129147244310392, "grad_norm": 2.4190189838409424, "learning_rate": 4.8e-05, "loss": 0.819, "step": 52000 }, { "epoch": 7.129147244310392, "eval_accuracy": 0.8344749377257509, "eval_loss": 0.9256834983825684, "eval_runtime": 258.4604, "eval_samples_per_second": 157.092, "eval_steps_per_second": 4.91, "step": 52000 }, { "epoch": 7.197696737044146, "grad_norm": 2.8802294731140137, "learning_rate": 4.75e-05, "loss": 0.8238, "step": 52500 }, { "epoch": 7.197696737044146, "eval_accuracy": 0.8351059340465287, "eval_loss": 0.9185708165168762, "eval_runtime": 258.4374, "eval_samples_per_second": 157.106, "eval_steps_per_second": 4.91, "step": 52500 }, { "epoch": 7.2662462297779, "grad_norm": 2.5864334106445312, "learning_rate": 4.7e-05, "loss": 0.8065, "step": 53000 }, { "epoch": 7.2662462297779, "eval_accuracy": 0.8358540639956455, "eval_loss": 0.9095313549041748, "eval_runtime": 258.6881, "eval_samples_per_second": 156.953, "eval_steps_per_second": 4.906, "step": 53000 }, { "epoch": 7.334795722511654, "grad_norm": 3.0811657905578613, "learning_rate": 4.6500000000000005e-05, "loss": 0.8199, "step": 53500 }, { "epoch": 7.334795722511654, "eval_accuracy": 0.8364544665607716, "eval_loss": 0.9065914154052734, "eval_runtime": 258.6721, "eval_samples_per_second": 156.963, "eval_steps_per_second": 4.906, "step": 53500 }, { "epoch": 7.403345215245407, "grad_norm": 2.3102753162384033, "learning_rate": 4.600000000000001e-05, "loss": 0.8018, "step": 54000 }, { "epoch": 7.403345215245407, "eval_accuracy": 0.8359817384838574, "eval_loss": 0.9075337052345276, "eval_runtime": 257.5046, "eval_samples_per_second": 157.675, "eval_steps_per_second": 4.928, "step": 54000 }, { "epoch": 7.471894707979161, "grad_norm": 2.269843578338623, "learning_rate": 4.55e-05, "loss": 0.8102, "step": 54500 }, { "epoch": 7.471894707979161, "eval_accuracy": 0.8372314644838865, "eval_loss": 0.9001559019088745, "eval_runtime": 258.4508, "eval_samples_per_second": 157.098, "eval_steps_per_second": 4.91, "step": 54500 }, { "epoch": 7.540444200712915, "grad_norm": 2.617309093475342, "learning_rate": 4.5e-05, "loss": 0.8194, "step": 55000 }, { "epoch": 7.540444200712915, "eval_accuracy": 0.8369146258637555, "eval_loss": 0.899241030216217, "eval_runtime": 257.4931, "eval_samples_per_second": 157.682, "eval_steps_per_second": 4.928, "step": 55000 }, { "epoch": 7.608993693446669, "grad_norm": 2.4634625911712646, "learning_rate": 4.4500000000000004e-05, "loss": 0.8138, "step": 55500 }, { "epoch": 7.608993693446669, "eval_accuracy": 0.8369996392235467, "eval_loss": 0.9033562541007996, "eval_runtime": 258.54, "eval_samples_per_second": 157.043, "eval_steps_per_second": 4.908, "step": 55500 }, { "epoch": 7.677543186180422, "grad_norm": 2.850604772567749, "learning_rate": 4.4000000000000006e-05, "loss": 0.8077, "step": 56000 }, { "epoch": 7.677543186180422, "eval_accuracy": 0.8378181332713783, "eval_loss": 0.9002473950386047, "eval_runtime": 257.5581, "eval_samples_per_second": 157.642, "eval_steps_per_second": 4.927, "step": 56000 }, { "epoch": 7.746092678914176, "grad_norm": 2.3677656650543213, "learning_rate": 4.35e-05, "loss": 0.8119, "step": 56500 }, { "epoch": 7.746092678914176, "eval_accuracy": 0.8382744553424182, "eval_loss": 0.8992937803268433, "eval_runtime": 258.5324, "eval_samples_per_second": 157.048, "eval_steps_per_second": 4.908, "step": 56500 }, { "epoch": 7.81464217164793, "grad_norm": 2.0961389541625977, "learning_rate": 4.3e-05, "loss": 0.8029, "step": 57000 }, { "epoch": 7.81464217164793, "eval_accuracy": 0.8377691746192005, "eval_loss": 0.8913019895553589, "eval_runtime": 258.5294, "eval_samples_per_second": 157.05, "eval_steps_per_second": 4.909, "step": 57000 }, { "epoch": 7.883191664381684, "grad_norm": 2.424496650695801, "learning_rate": 4.25e-05, "loss": 0.802, "step": 57500 }, { "epoch": 7.883191664381684, "eval_accuracy": 0.8387740254965514, "eval_loss": 0.8913179039955139, "eval_runtime": 258.6018, "eval_samples_per_second": 157.006, "eval_steps_per_second": 4.907, "step": 57500 }, { "epoch": 7.951741157115437, "grad_norm": 2.8273098468780518, "learning_rate": 4.2e-05, "loss": 0.7887, "step": 58000 }, { "epoch": 7.951741157115437, "eval_accuracy": 0.8389023572318363, "eval_loss": 0.8917869329452515, "eval_runtime": 259.2663, "eval_samples_per_second": 156.603, "eval_steps_per_second": 4.895, "step": 58000 }, { "epoch": 8.02029064984919, "grad_norm": 2.5863022804260254, "learning_rate": 4.15e-05, "loss": 0.7902, "step": 58500 }, { "epoch": 8.02029064984919, "eval_accuracy": 0.8385434985627828, "eval_loss": 0.8866747617721558, "eval_runtime": 257.9905, "eval_samples_per_second": 157.378, "eval_steps_per_second": 4.919, "step": 58500 }, { "epoch": 8.088840142582946, "grad_norm": 2.357172727584839, "learning_rate": 4.1e-05, "loss": 0.7892, "step": 59000 }, { "epoch": 8.088840142582946, "eval_accuracy": 0.8392591622264518, "eval_loss": 0.8825114369392395, "eval_runtime": 258.6746, "eval_samples_per_second": 156.962, "eval_steps_per_second": 4.906, "step": 59000 }, { "epoch": 8.157389635316699, "grad_norm": 2.3970017433166504, "learning_rate": 4.05e-05, "loss": 0.7928, "step": 59500 }, { "epoch": 8.157389635316699, "eval_accuracy": 0.8398791985985096, "eval_loss": 0.8858514428138733, "eval_runtime": 258.8038, "eval_samples_per_second": 156.883, "eval_steps_per_second": 4.903, "step": 59500 }, { "epoch": 8.225939128050452, "grad_norm": 2.1816744804382324, "learning_rate": 4e-05, "loss": 0.786, "step": 60000 }, { "epoch": 8.225939128050452, "eval_accuracy": 0.8399906549712216, "eval_loss": 0.8871041536331177, "eval_runtime": 257.814, "eval_samples_per_second": 157.486, "eval_steps_per_second": 4.922, "step": 60000 }, { "epoch": 8.294488620784206, "grad_norm": 2.6891512870788574, "learning_rate": 3.9500000000000005e-05, "loss": 0.7838, "step": 60500 }, { "epoch": 8.294488620784206, "eval_accuracy": 0.8402320722657605, "eval_loss": 0.8783635497093201, "eval_runtime": 257.6277, "eval_samples_per_second": 157.6, "eval_steps_per_second": 4.926, "step": 60500 }, { "epoch": 8.36303811351796, "grad_norm": 2.2459070682525635, "learning_rate": 3.9000000000000006e-05, "loss": 0.7857, "step": 61000 }, { "epoch": 8.36303811351796, "eval_accuracy": 0.8401854172212088, "eval_loss": 0.8782520294189453, "eval_runtime": 258.8529, "eval_samples_per_second": 156.854, "eval_steps_per_second": 4.902, "step": 61000 }, { "epoch": 8.431587606251714, "grad_norm": 2.3516621589660645, "learning_rate": 3.85e-05, "loss": 0.7807, "step": 61500 }, { "epoch": 8.431587606251714, "eval_accuracy": 0.8408464558399127, "eval_loss": 0.8879706263542175, "eval_runtime": 258.721, "eval_samples_per_second": 156.934, "eval_steps_per_second": 4.905, "step": 61500 }, { "epoch": 8.500137098985467, "grad_norm": 2.408498764038086, "learning_rate": 3.8e-05, "loss": 0.7869, "step": 62000 }, { "epoch": 8.500137098985467, "eval_accuracy": 0.8411305726347277, "eval_loss": 0.8754673004150391, "eval_runtime": 258.7894, "eval_samples_per_second": 156.892, "eval_steps_per_second": 4.904, "step": 62000 }, { "epoch": 8.56868659171922, "grad_norm": 2.7398715019226074, "learning_rate": 3.7500000000000003e-05, "loss": 0.7768, "step": 62500 }, { "epoch": 8.56868659171922, "eval_accuracy": 0.8415116048695496, "eval_loss": 0.8766404390335083, "eval_runtime": 258.7705, "eval_samples_per_second": 156.903, "eval_steps_per_second": 4.904, "step": 62500 }, { "epoch": 8.637236084452976, "grad_norm": 2.7151975631713867, "learning_rate": 3.7e-05, "loss": 0.7806, "step": 63000 }, { "epoch": 8.637236084452976, "eval_accuracy": 0.8416668834829366, "eval_loss": 0.8836163282394409, "eval_runtime": 257.8541, "eval_samples_per_second": 157.461, "eval_steps_per_second": 4.921, "step": 63000 }, { "epoch": 8.70578557718673, "grad_norm": 2.50140380859375, "learning_rate": 3.65e-05, "loss": 0.7811, "step": 63500 }, { "epoch": 8.70578557718673, "eval_accuracy": 0.8423243126907809, "eval_loss": 0.8704027533531189, "eval_runtime": 256.7585, "eval_samples_per_second": 158.133, "eval_steps_per_second": 4.942, "step": 63500 }, { "epoch": 8.774335069920483, "grad_norm": 2.9045302867889404, "learning_rate": 3.6e-05, "loss": 0.7733, "step": 64000 }, { "epoch": 8.774335069920483, "eval_accuracy": 0.8424119193113302, "eval_loss": 0.8675287365913391, "eval_runtime": 258.0076, "eval_samples_per_second": 157.367, "eval_steps_per_second": 4.918, "step": 64000 }, { "epoch": 8.842884562654236, "grad_norm": 2.8266477584838867, "learning_rate": 3.55e-05, "loss": 0.7728, "step": 64500 }, { "epoch": 8.842884562654236, "eval_accuracy": 0.8425746458723948, "eval_loss": 0.8766723871231079, "eval_runtime": 257.1101, "eval_samples_per_second": 157.917, "eval_steps_per_second": 4.936, "step": 64500 }, { "epoch": 8.911434055387991, "grad_norm": 2.3678436279296875, "learning_rate": 3.5e-05, "loss": 0.7779, "step": 65000 }, { "epoch": 8.911434055387991, "eval_accuracy": 0.842695820921136, "eval_loss": 0.8741580843925476, "eval_runtime": 258.4977, "eval_samples_per_second": 157.069, "eval_steps_per_second": 4.909, "step": 65000 }, { "epoch": 8.979983548121744, "grad_norm": 2.4681596755981445, "learning_rate": 3.45e-05, "loss": 0.7779, "step": 65500 }, { "epoch": 8.979983548121744, "eval_accuracy": 0.8435885693668189, "eval_loss": 0.8590840697288513, "eval_runtime": 258.179, "eval_samples_per_second": 157.263, "eval_steps_per_second": 4.915, "step": 65500 }, { "epoch": 9.048533040855498, "grad_norm": 2.4200708866119385, "learning_rate": 3.4000000000000007e-05, "loss": 0.7704, "step": 66000 }, { "epoch": 9.048533040855498, "eval_accuracy": 0.8437169036285643, "eval_loss": 0.8766728639602661, "eval_runtime": 258.0969, "eval_samples_per_second": 157.313, "eval_steps_per_second": 4.917, "step": 66000 }, { "epoch": 9.117082533589251, "grad_norm": 2.753324270248413, "learning_rate": 3.35e-05, "loss": 0.7695, "step": 66500 }, { "epoch": 9.117082533589251, "eval_accuracy": 0.8435838175840091, "eval_loss": 0.881564199924469, "eval_runtime": 259.2788, "eval_samples_per_second": 156.596, "eval_steps_per_second": 4.894, "step": 66500 }, { "epoch": 9.185632026323006, "grad_norm": 2.490852117538452, "learning_rate": 3.3e-05, "loss": 0.7617, "step": 67000 }, { "epoch": 9.185632026323006, "eval_accuracy": 0.8437435512372087, "eval_loss": 0.8751281499862671, "eval_runtime": 256.1107, "eval_samples_per_second": 158.533, "eval_steps_per_second": 4.955, "step": 67000 }, { "epoch": 9.25418151905676, "grad_norm": 2.581777334213257, "learning_rate": 3.2500000000000004e-05, "loss": 0.7585, "step": 67500 }, { "epoch": 9.25418151905676, "eval_accuracy": 0.8439763030486503, "eval_loss": 0.8657551407814026, "eval_runtime": 257.7079, "eval_samples_per_second": 157.55, "eval_steps_per_second": 4.924, "step": 67500 }, { "epoch": 9.322731011790513, "grad_norm": 2.997283458709717, "learning_rate": 3.2000000000000005e-05, "loss": 0.7657, "step": 68000 }, { "epoch": 9.322731011790513, "eval_accuracy": 0.8446350018812996, "eval_loss": 0.8639153838157654, "eval_runtime": 256.5945, "eval_samples_per_second": 158.234, "eval_steps_per_second": 4.946, "step": 68000 }, { "epoch": 9.391280504524266, "grad_norm": 2.7763564586639404, "learning_rate": 3.15e-05, "loss": 0.759, "step": 68500 }, { "epoch": 9.391280504524266, "eval_accuracy": 0.8448020556501544, "eval_loss": 0.8533274531364441, "eval_runtime": 256.963, "eval_samples_per_second": 158.007, "eval_steps_per_second": 4.938, "step": 68500 }, { "epoch": 9.459829997258021, "grad_norm": 2.864605665206909, "learning_rate": 3.1e-05, "loss": 0.7574, "step": 69000 }, { "epoch": 9.459829997258021, "eval_accuracy": 0.8444638910956694, "eval_loss": 0.8697899580001831, "eval_runtime": 255.8551, "eval_samples_per_second": 158.691, "eval_steps_per_second": 4.96, "step": 69000 }, { "epoch": 9.528379489991774, "grad_norm": 2.5367231369018555, "learning_rate": 3.05e-05, "loss": 0.7529, "step": 69500 }, { "epoch": 9.528379489991774, "eval_accuracy": 0.8451616733099838, "eval_loss": 0.8582028746604919, "eval_runtime": 256.6803, "eval_samples_per_second": 158.181, "eval_steps_per_second": 4.944, "step": 69500 }, { "epoch": 9.596928982725528, "grad_norm": 2.1710877418518066, "learning_rate": 3e-05, "loss": 0.7577, "step": 70000 }, { "epoch": 9.596928982725528, "eval_accuracy": 0.845897620114414, "eval_loss": 0.8616137504577637, "eval_runtime": 255.6697, "eval_samples_per_second": 158.806, "eval_steps_per_second": 4.963, "step": 70000 }, { "epoch": 9.665478475459281, "grad_norm": 2.400867462158203, "learning_rate": 2.95e-05, "loss": 0.7554, "step": 70500 }, { "epoch": 9.665478475459281, "eval_accuracy": 0.8460372178025299, "eval_loss": 0.8466119766235352, "eval_runtime": 256.0692, "eval_samples_per_second": 158.559, "eval_steps_per_second": 4.956, "step": 70500 }, { "epoch": 9.734027968193036, "grad_norm": 2.6465237140655518, "learning_rate": 2.9e-05, "loss": 0.7406, "step": 71000 }, { "epoch": 9.734027968193036, "eval_accuracy": 0.846062622809469, "eval_loss": 0.8593913316726685, "eval_runtime": 257.0976, "eval_samples_per_second": 157.924, "eval_steps_per_second": 4.936, "step": 71000 }, { "epoch": 9.80257746092679, "grad_norm": 2.72021222114563, "learning_rate": 2.8499999999999998e-05, "loss": 0.7543, "step": 71500 }, { "epoch": 9.80257746092679, "eval_accuracy": 0.8463149479101548, "eval_loss": 0.8515172600746155, "eval_runtime": 256.7199, "eval_samples_per_second": 158.157, "eval_steps_per_second": 4.943, "step": 71500 }, { "epoch": 9.871126953660543, "grad_norm": 2.639139175415039, "learning_rate": 2.8000000000000003e-05, "loss": 0.7506, "step": 72000 }, { "epoch": 9.871126953660543, "eval_accuracy": 0.8465278129829884, "eval_loss": 0.8525589108467102, "eval_runtime": 258.3778, "eval_samples_per_second": 157.142, "eval_steps_per_second": 4.911, "step": 72000 }, { "epoch": 9.939676446394296, "grad_norm": 2.2242422103881836, "learning_rate": 2.7500000000000004e-05, "loss": 0.7517, "step": 72500 }, { "epoch": 9.939676446394296, "eval_accuracy": 0.8470482563336492, "eval_loss": 0.8529332876205444, "eval_runtime": 257.3598, "eval_samples_per_second": 157.764, "eval_steps_per_second": 4.931, "step": 72500 }, { "epoch": 10.008225939128051, "grad_norm": 2.333671808242798, "learning_rate": 2.7000000000000002e-05, "loss": 0.7425, "step": 73000 }, { "epoch": 10.008225939128051, "eval_accuracy": 0.8467576970668704, "eval_loss": 0.8543536067008972, "eval_runtime": 257.3323, "eval_samples_per_second": 157.78, "eval_steps_per_second": 4.931, "step": 73000 }, { "epoch": 10.076775431861805, "grad_norm": 2.4138877391815186, "learning_rate": 2.6500000000000004e-05, "loss": 0.7464, "step": 73500 }, { "epoch": 10.076775431861805, "eval_accuracy": 0.8473971965940057, "eval_loss": 0.8455188870429993, "eval_runtime": 257.2349, "eval_samples_per_second": 157.84, "eval_steps_per_second": 4.933, "step": 73500 }, { "epoch": 10.145324924595558, "grad_norm": 2.812563180923462, "learning_rate": 2.6000000000000002e-05, "loss": 0.7412, "step": 74000 }, { "epoch": 10.145324924595558, "eval_accuracy": 0.8475976099587939, "eval_loss": 0.8453831076622009, "eval_runtime": 256.9915, "eval_samples_per_second": 157.99, "eval_steps_per_second": 4.938, "step": 74000 }, { "epoch": 10.213874417329311, "grad_norm": 2.369260549545288, "learning_rate": 2.5500000000000003e-05, "loss": 0.7346, "step": 74500 }, { "epoch": 10.213874417329311, "eval_accuracy": 0.8474954397549382, "eval_loss": 0.8512648344039917, "eval_runtime": 258.0335, "eval_samples_per_second": 157.352, "eval_steps_per_second": 4.918, "step": 74500 }, { "epoch": 10.282423910063066, "grad_norm": 2.7622134685516357, "learning_rate": 2.5e-05, "loss": 0.7424, "step": 75000 }, { "epoch": 10.282423910063066, "eval_accuracy": 0.8481833714959756, "eval_loss": 0.8343672156333923, "eval_runtime": 258.2629, "eval_samples_per_second": 157.212, "eval_steps_per_second": 4.914, "step": 75000 }, { "epoch": 10.35097340279682, "grad_norm": 2.2065768241882324, "learning_rate": 2.45e-05, "loss": 0.7364, "step": 75500 }, { "epoch": 10.35097340279682, "eval_accuracy": 0.8482113108532771, "eval_loss": 0.8340145945549011, "eval_runtime": 258.3206, "eval_samples_per_second": 157.177, "eval_steps_per_second": 4.913, "step": 75500 }, { "epoch": 10.419522895530573, "grad_norm": 2.186100721359253, "learning_rate": 2.4e-05, "loss": 0.7409, "step": 76000 }, { "epoch": 10.419522895530573, "eval_accuracy": 0.8489686223957396, "eval_loss": 0.8362465500831604, "eval_runtime": 256.9187, "eval_samples_per_second": 158.034, "eval_steps_per_second": 4.939, "step": 76000 }, { "epoch": 10.488072388264326, "grad_norm": 2.706817626953125, "learning_rate": 2.35e-05, "loss": 0.7353, "step": 76500 }, { "epoch": 10.488072388264326, "eval_accuracy": 0.8487501900476949, "eval_loss": 0.8368015289306641, "eval_runtime": 257.1636, "eval_samples_per_second": 157.884, "eval_steps_per_second": 4.935, "step": 76500 }, { "epoch": 10.556621880998081, "grad_norm": 2.523261308670044, "learning_rate": 2.3000000000000003e-05, "loss": 0.731, "step": 77000 }, { "epoch": 10.556621880998081, "eval_accuracy": 0.8489030226241834, "eval_loss": 0.8337299823760986, "eval_runtime": 256.9915, "eval_samples_per_second": 157.99, "eval_steps_per_second": 4.938, "step": 77000 }, { "epoch": 10.625171373731835, "grad_norm": 2.606250286102295, "learning_rate": 2.25e-05, "loss": 0.7292, "step": 77500 }, { "epoch": 10.625171373731835, "eval_accuracy": 0.8478736538649552, "eval_loss": 0.8499141335487366, "eval_runtime": 258.2456, "eval_samples_per_second": 157.222, "eval_steps_per_second": 4.914, "step": 77500 }, { "epoch": 10.693720866465588, "grad_norm": 2.5361220836639404, "learning_rate": 2.2000000000000003e-05, "loss": 0.7359, "step": 78000 }, { "epoch": 10.693720866465588, "eval_accuracy": 0.8490765860082904, "eval_loss": 0.8316646218299866, "eval_runtime": 258.3157, "eval_samples_per_second": 157.18, "eval_steps_per_second": 4.913, "step": 78000 }, { "epoch": 10.762270359199341, "grad_norm": 2.3277316093444824, "learning_rate": 2.15e-05, "loss": 0.7284, "step": 78500 }, { "epoch": 10.762270359199341, "eval_accuracy": 0.8495720225312002, "eval_loss": 0.8365707397460938, "eval_runtime": 258.2814, "eval_samples_per_second": 157.201, "eval_steps_per_second": 4.913, "step": 78500 }, { "epoch": 10.830819851933096, "grad_norm": 2.746189832687378, "learning_rate": 2.1e-05, "loss": 0.7316, "step": 79000 }, { "epoch": 10.830819851933096, "eval_accuracy": 0.8500147906280335, "eval_loss": 0.8251886963844299, "eval_runtime": 257.4381, "eval_samples_per_second": 157.716, "eval_steps_per_second": 4.929, "step": 79000 }, { "epoch": 10.89936934466685, "grad_norm": 2.9917354583740234, "learning_rate": 2.05e-05, "loss": 0.7304, "step": 79500 }, { "epoch": 10.89936934466685, "eval_accuracy": 0.8502983624757147, "eval_loss": 0.8259178996086121, "eval_runtime": 257.2855, "eval_samples_per_second": 157.809, "eval_steps_per_second": 4.932, "step": 79500 }, { "epoch": 10.967918837400603, "grad_norm": 2.275324583053589, "learning_rate": 2e-05, "loss": 0.7255, "step": 80000 }, { "epoch": 10.967918837400603, "eval_accuracy": 0.8505023638996737, "eval_loss": 0.8250493407249451, "eval_runtime": 257.2933, "eval_samples_per_second": 157.804, "eval_steps_per_second": 4.932, "step": 80000 }, { "epoch": 11.036468330134356, "grad_norm": 2.6113440990448, "learning_rate": 1.9500000000000003e-05, "loss": 0.7224, "step": 80500 }, { "epoch": 11.036468330134356, "eval_accuracy": 0.8507015443771133, "eval_loss": 0.8299734592437744, "eval_runtime": 257.4676, "eval_samples_per_second": 157.698, "eval_steps_per_second": 4.929, "step": 80500 }, { "epoch": 11.105017822868112, "grad_norm": 2.6536693572998047, "learning_rate": 1.9e-05, "loss": 0.7208, "step": 81000 }, { "epoch": 11.105017822868112, "eval_accuracy": 0.8506463117116102, "eval_loss": 0.8155694007873535, "eval_runtime": 256.8592, "eval_samples_per_second": 158.071, "eval_steps_per_second": 4.94, "step": 81000 }, { "epoch": 11.173567315601865, "grad_norm": 2.290782928466797, "learning_rate": 1.85e-05, "loss": 0.7148, "step": 81500 }, { "epoch": 11.173567315601865, "eval_accuracy": 0.8507982915676063, "eval_loss": 0.8275089859962463, "eval_runtime": 257.1589, "eval_samples_per_second": 157.887, "eval_steps_per_second": 4.935, "step": 81500 }, { "epoch": 11.242116808335618, "grad_norm": 2.6533780097961426, "learning_rate": 1.8e-05, "loss": 0.7193, "step": 82000 }, { "epoch": 11.242116808335618, "eval_accuracy": 0.8510864940426184, "eval_loss": 0.8217721581459045, "eval_runtime": 258.1645, "eval_samples_per_second": 157.272, "eval_steps_per_second": 4.915, "step": 82000 }, { "epoch": 11.310666301069372, "grad_norm": 2.6084372997283936, "learning_rate": 1.75e-05, "loss": 0.7177, "step": 82500 }, { "epoch": 11.310666301069372, "eval_accuracy": 0.8508369557490207, "eval_loss": 0.8289022445678711, "eval_runtime": 258.043, "eval_samples_per_second": 157.346, "eval_steps_per_second": 4.918, "step": 82500 }, { "epoch": 11.379215793803127, "grad_norm": 2.2717843055725098, "learning_rate": 1.7000000000000003e-05, "loss": 0.7211, "step": 83000 }, { "epoch": 11.379215793803127, "eval_accuracy": 0.8514459749572929, "eval_loss": 0.8198857307434082, "eval_runtime": 258.3084, "eval_samples_per_second": 157.184, "eval_steps_per_second": 4.913, "step": 83000 }, { "epoch": 11.44776528653688, "grad_norm": 2.34387469291687, "learning_rate": 1.65e-05, "loss": 0.7093, "step": 83500 }, { "epoch": 11.44776528653688, "eval_accuracy": 0.8511664827348943, "eval_loss": 0.8272643089294434, "eval_runtime": 258.1866, "eval_samples_per_second": 157.258, "eval_steps_per_second": 4.915, "step": 83500 }, { "epoch": 11.516314779270633, "grad_norm": 2.3854498863220215, "learning_rate": 1.6000000000000003e-05, "loss": 0.7154, "step": 84000 }, { "epoch": 11.516314779270633, "eval_accuracy": 0.8518011662252298, "eval_loss": 0.8211445212364197, "eval_runtime": 257.2815, "eval_samples_per_second": 157.812, "eval_steps_per_second": 4.932, "step": 84000 }, { "epoch": 11.584864272004387, "grad_norm": 2.4457602500915527, "learning_rate": 1.55e-05, "loss": 0.7178, "step": 84500 }, { "epoch": 11.584864272004387, "eval_accuracy": 0.8521036808235916, "eval_loss": 0.8183203339576721, "eval_runtime": 258.3778, "eval_samples_per_second": 157.142, "eval_steps_per_second": 4.911, "step": 84500 }, { "epoch": 11.653413764738142, "grad_norm": 2.5457184314727783, "learning_rate": 1.5e-05, "loss": 0.716, "step": 85000 }, { "epoch": 11.653413764738142, "eval_accuracy": 0.8522289156626506, "eval_loss": 0.8176619410514832, "eval_runtime": 256.9844, "eval_samples_per_second": 157.994, "eval_steps_per_second": 4.938, "step": 85000 }, { "epoch": 11.721963257471895, "grad_norm": 2.855541467666626, "learning_rate": 1.45e-05, "loss": 0.7081, "step": 85500 }, { "epoch": 11.721963257471895, "eval_accuracy": 0.8522235192962014, "eval_loss": 0.8102879524230957, "eval_runtime": 257.0844, "eval_samples_per_second": 157.933, "eval_steps_per_second": 4.936, "step": 85500 }, { "epoch": 11.790512750205648, "grad_norm": 2.44242787361145, "learning_rate": 1.4000000000000001e-05, "loss": 0.7112, "step": 86000 }, { "epoch": 11.790512750205648, "eval_accuracy": 0.8531087746062566, "eval_loss": 0.8167855739593506, "eval_runtime": 258.2921, "eval_samples_per_second": 157.194, "eval_steps_per_second": 4.913, "step": 86000 }, { "epoch": 11.859062242939402, "grad_norm": 2.559410333633423, "learning_rate": 1.3500000000000001e-05, "loss": 0.7089, "step": 86500 }, { "epoch": 11.859062242939402, "eval_accuracy": 0.8524401167338927, "eval_loss": 0.8145312666893005, "eval_runtime": 258.2263, "eval_samples_per_second": 157.234, "eval_steps_per_second": 4.914, "step": 86500 }, { "epoch": 11.927611735673157, "grad_norm": 2.3273496627807617, "learning_rate": 1.3000000000000001e-05, "loss": 0.7033, "step": 87000 }, { "epoch": 11.927611735673157, "eval_accuracy": 0.8526681918388697, "eval_loss": 0.8153809309005737, "eval_runtime": 257.196, "eval_samples_per_second": 157.864, "eval_steps_per_second": 4.934, "step": 87000 }, { "epoch": 11.99616122840691, "grad_norm": 2.354360580444336, "learning_rate": 1.25e-05, "loss": 0.7029, "step": 87500 }, { "epoch": 11.99616122840691, "eval_accuracy": 0.8531567384453895, "eval_loss": 0.8229334354400635, "eval_runtime": 257.3002, "eval_samples_per_second": 157.8, "eval_steps_per_second": 4.932, "step": 87500 }, { "epoch": 12.064710721140663, "grad_norm": 2.4728591442108154, "learning_rate": 1.2e-05, "loss": 0.7058, "step": 88000 }, { "epoch": 12.064710721140663, "eval_accuracy": 0.8537915504584094, "eval_loss": 0.8140564560890198, "eval_runtime": 258.2835, "eval_samples_per_second": 157.199, "eval_steps_per_second": 4.913, "step": 88000 }, { "epoch": 12.133260213874417, "grad_norm": 2.485384702682495, "learning_rate": 1.1500000000000002e-05, "loss": 0.7005, "step": 88500 }, { "epoch": 12.133260213874417, "eval_accuracy": 0.8534658761375271, "eval_loss": 0.8151687383651733, "eval_runtime": 258.2434, "eval_samples_per_second": 157.224, "eval_steps_per_second": 4.914, "step": 88500 }, { "epoch": 12.201809706608172, "grad_norm": 2.530062198638916, "learning_rate": 1.1000000000000001e-05, "loss": 0.6992, "step": 89000 }, { "epoch": 12.201809706608172, "eval_accuracy": 0.853826029943314, "eval_loss": 0.8016021847724915, "eval_runtime": 258.2461, "eval_samples_per_second": 157.222, "eval_steps_per_second": 4.914, "step": 89000 }, { "epoch": 12.270359199341925, "grad_norm": 2.5869436264038086, "learning_rate": 1.05e-05, "loss": 0.7008, "step": 89500 }, { "epoch": 12.270359199341925, "eval_accuracy": 0.8535235114498525, "eval_loss": 0.8112274408340454, "eval_runtime": 257.5365, "eval_samples_per_second": 157.655, "eval_steps_per_second": 4.927, "step": 89500 }, { "epoch": 12.338908692075679, "grad_norm": 2.8641934394836426, "learning_rate": 1e-05, "loss": 0.6979, "step": 90000 }, { "epoch": 12.338908692075679, "eval_accuracy": 0.8538098856943305, "eval_loss": 0.8109295964241028, "eval_runtime": 258.3955, "eval_samples_per_second": 157.131, "eval_steps_per_second": 4.911, "step": 90000 }, { "epoch": 12.407458184809432, "grad_norm": 2.686566114425659, "learning_rate": 9.5e-06, "loss": 0.6949, "step": 90500 }, { "epoch": 12.407458184809432, "eval_accuracy": 0.8543918997918827, "eval_loss": 0.8125308156013489, "eval_runtime": 258.1617, "eval_samples_per_second": 157.274, "eval_steps_per_second": 4.916, "step": 90500 }, { "epoch": 12.476007677543187, "grad_norm": 2.452526569366455, "learning_rate": 9e-06, "loss": 0.6946, "step": 91000 }, { "epoch": 12.476007677543187, "eval_accuracy": 0.8538440239723134, "eval_loss": 0.8097832798957825, "eval_runtime": 258.2364, "eval_samples_per_second": 157.228, "eval_steps_per_second": 4.914, "step": 91000 }, { "epoch": 12.54455717027694, "grad_norm": 2.463740825653076, "learning_rate": 8.500000000000002e-06, "loss": 0.6939, "step": 91500 }, { "epoch": 12.54455717027694, "eval_accuracy": 0.854438946024891, "eval_loss": 0.7999902963638306, "eval_runtime": 257.2634, "eval_samples_per_second": 157.823, "eval_steps_per_second": 4.933, "step": 91500 }, { "epoch": 12.613106663010694, "grad_norm": 2.547820568084717, "learning_rate": 8.000000000000001e-06, "loss": 0.6969, "step": 92000 }, { "epoch": 12.613106663010694, "eval_accuracy": 0.8542858590534823, "eval_loss": 0.8070544004440308, "eval_runtime": 257.3491, "eval_samples_per_second": 157.77, "eval_steps_per_second": 4.931, "step": 92000 }, { "epoch": 12.681656155744447, "grad_norm": 2.2731072902679443, "learning_rate": 7.5e-06, "loss": 0.6967, "step": 92500 }, { "epoch": 12.681656155744447, "eval_accuracy": 0.8545510749739543, "eval_loss": 0.7984638810157776, "eval_runtime": 258.6739, "eval_samples_per_second": 156.962, "eval_steps_per_second": 4.906, "step": 92500 }, { "epoch": 12.750205648478202, "grad_norm": 2.242337226867676, "learning_rate": 7.000000000000001e-06, "loss": 0.6944, "step": 93000 }, { "epoch": 12.750205648478202, "eval_accuracy": 0.85512864037364, "eval_loss": 0.7989787459373474, "eval_runtime": 258.5499, "eval_samples_per_second": 157.037, "eval_steps_per_second": 4.908, "step": 93000 }, { "epoch": 12.818755141211955, "grad_norm": 2.4914486408233643, "learning_rate": 6.5000000000000004e-06, "loss": 0.6885, "step": 93500 }, { "epoch": 12.818755141211955, "eval_accuracy": 0.8550956587798725, "eval_loss": 0.8170965313911438, "eval_runtime": 258.3692, "eval_samples_per_second": 157.147, "eval_steps_per_second": 4.912, "step": 93500 }, { "epoch": 12.887304633945709, "grad_norm": 2.1909425258636475, "learning_rate": 6e-06, "loss": 0.6897, "step": 94000 }, { "epoch": 12.887304633945709, "eval_accuracy": 0.8550768677242255, "eval_loss": 0.8014948964118958, "eval_runtime": 258.4993, "eval_samples_per_second": 157.068, "eval_steps_per_second": 4.909, "step": 94000 }, { "epoch": 12.955854126679462, "grad_norm": 2.7882330417633057, "learning_rate": 5.500000000000001e-06, "loss": 0.7027, "step": 94500 }, { "epoch": 12.955854126679462, "eval_accuracy": 0.8551748096354626, "eval_loss": 0.8074929118156433, "eval_runtime": 258.4421, "eval_samples_per_second": 157.103, "eval_steps_per_second": 4.91, "step": 94500 }, { "epoch": 13.024403619413217, "grad_norm": 2.189662218093872, "learning_rate": 5e-06, "loss": 0.6926, "step": 95000 }, { "epoch": 13.024403619413217, "eval_accuracy": 0.8554257092266915, "eval_loss": 0.8118977546691895, "eval_runtime": 257.8701, "eval_samples_per_second": 157.451, "eval_steps_per_second": 4.921, "step": 95000 }, { "epoch": 13.09295311214697, "grad_norm": 2.4796793460845947, "learning_rate": 4.5e-06, "loss": 0.697, "step": 95500 }, { "epoch": 13.09295311214697, "eval_accuracy": 0.8558198770391428, "eval_loss": 0.7951220870018005, "eval_runtime": 258.6851, "eval_samples_per_second": 156.955, "eval_steps_per_second": 4.906, "step": 95500 }, { "epoch": 13.161502604880724, "grad_norm": 2.475494146347046, "learning_rate": 4.000000000000001e-06, "loss": 0.6814, "step": 96000 }, { "epoch": 13.161502604880724, "eval_accuracy": 0.8557974835152675, "eval_loss": 0.7992942929267883, "eval_runtime": 258.6884, "eval_samples_per_second": 156.953, "eval_steps_per_second": 4.906, "step": 96000 }, { "epoch": 13.230052097614477, "grad_norm": 2.662364959716797, "learning_rate": 3.5000000000000004e-06, "loss": 0.687, "step": 96500 }, { "epoch": 13.230052097614477, "eval_accuracy": 0.8556396210250248, "eval_loss": 0.7970269322395325, "eval_runtime": 257.7318, "eval_samples_per_second": 157.536, "eval_steps_per_second": 4.924, "step": 96500 }, { "epoch": 13.298601590348232, "grad_norm": 2.5556256771087646, "learning_rate": 3e-06, "loss": 0.6956, "step": 97000 }, { "epoch": 13.298601590348232, "eval_accuracy": 0.8560198209787908, "eval_loss": 0.7952587008476257, "eval_runtime": 258.556, "eval_samples_per_second": 157.034, "eval_steps_per_second": 4.908, "step": 97000 }, { "epoch": 13.367151083081986, "grad_norm": 2.327164888381958, "learning_rate": 2.5e-06, "loss": 0.6821, "step": 97500 }, { "epoch": 13.367151083081986, "eval_accuracy": 0.855291832818916, "eval_loss": 0.8007811307907104, "eval_runtime": 257.5846, "eval_samples_per_second": 157.626, "eval_steps_per_second": 4.927, "step": 97500 }, { "epoch": 13.435700575815739, "grad_norm": 2.408548593521118, "learning_rate": 2.0000000000000003e-06, "loss": 0.6846, "step": 98000 }, { "epoch": 13.435700575815739, "eval_accuracy": 0.8561709926629, "eval_loss": 0.7897204756736755, "eval_runtime": 258.6709, "eval_samples_per_second": 156.964, "eval_steps_per_second": 4.906, "step": 98000 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.262679236758733e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }