|
{ |
|
"best_metric": 0.7897204756736755, |
|
"best_model_checkpoint": "./model_fine-tune/glot/mbert/ron-Latn/checkpoint-98000", |
|
"epoch": 13.435700575815739, |
|
"eval_steps": 500, |
|
"global_step": 98000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06854949273375377, |
|
"grad_norm": 4.000861644744873, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.7568, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.06854949273375377, |
|
"eval_accuracy": 0.7095725691837217, |
|
"eval_loss": 1.6891406774520874, |
|
"eval_runtime": 259.6177, |
|
"eval_samples_per_second": 156.391, |
|
"eval_steps_per_second": 4.888, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13709898546750754, |
|
"grad_norm": 4.238007545471191, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.5655, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.13709898546750754, |
|
"eval_accuracy": 0.7265132414723929, |
|
"eval_loss": 1.5632870197296143, |
|
"eval_runtime": 260.4774, |
|
"eval_samples_per_second": 155.875, |
|
"eval_steps_per_second": 4.872, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2056484782012613, |
|
"grad_norm": 3.5532774925231934, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.4944, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2056484782012613, |
|
"eval_accuracy": 0.7360105726108715, |
|
"eval_loss": 1.4989327192306519, |
|
"eval_runtime": 260.0937, |
|
"eval_samples_per_second": 156.105, |
|
"eval_steps_per_second": 4.879, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2741979709350151, |
|
"grad_norm": 3.5770881175994873, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.4324, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2741979709350151, |
|
"eval_accuracy": 0.743094448707599, |
|
"eval_loss": 1.4517085552215576, |
|
"eval_runtime": 260.1265, |
|
"eval_samples_per_second": 156.086, |
|
"eval_steps_per_second": 4.878, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.34274746366876885, |
|
"grad_norm": 3.5476059913635254, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.3797, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.34274746366876885, |
|
"eval_accuracy": 0.7503641171887161, |
|
"eval_loss": 1.4144247770309448, |
|
"eval_runtime": 258.9987, |
|
"eval_samples_per_second": 156.765, |
|
"eval_steps_per_second": 4.9, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4112969564025226, |
|
"grad_norm": 3.64892578125, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.3435, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4112969564025226, |
|
"eval_accuracy": 0.7549876281769841, |
|
"eval_loss": 1.380112648010254, |
|
"eval_runtime": 258.9885, |
|
"eval_samples_per_second": 156.771, |
|
"eval_steps_per_second": 4.9, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4798464491362764, |
|
"grad_norm": 3.1375253200531006, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.3133, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4798464491362764, |
|
"eval_accuracy": 0.7577493236618825, |
|
"eval_loss": 1.374040126800537, |
|
"eval_runtime": 259.0671, |
|
"eval_samples_per_second": 156.724, |
|
"eval_steps_per_second": 4.898, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5483959418700302, |
|
"grad_norm": 3.1487910747528076, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.2891, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5483959418700302, |
|
"eval_accuracy": 0.76247168256945, |
|
"eval_loss": 1.34718918800354, |
|
"eval_runtime": 259.0025, |
|
"eval_samples_per_second": 156.763, |
|
"eval_steps_per_second": 4.9, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6169454346037839, |
|
"grad_norm": 2.947143316268921, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.2764, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6169454346037839, |
|
"eval_accuracy": 0.7652144022180228, |
|
"eval_loss": 1.3174166679382324, |
|
"eval_runtime": 258.6684, |
|
"eval_samples_per_second": 156.965, |
|
"eval_steps_per_second": 4.906, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6854949273375377, |
|
"grad_norm": 3.274010181427002, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.2492, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6854949273375377, |
|
"eval_accuracy": 0.7671819928468485, |
|
"eval_loss": 1.313983678817749, |
|
"eval_runtime": 258.6625, |
|
"eval_samples_per_second": 156.969, |
|
"eval_steps_per_second": 4.906, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7540444200712915, |
|
"grad_norm": 3.109161138534546, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.2329, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7540444200712915, |
|
"eval_accuracy": 0.7713562268262496, |
|
"eval_loss": 1.2868432998657227, |
|
"eval_runtime": 260.0165, |
|
"eval_samples_per_second": 156.152, |
|
"eval_steps_per_second": 4.88, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.8225939128050452, |
|
"grad_norm": 7.044505596160889, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.2232, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8225939128050452, |
|
"eval_accuracy": 0.7731761649539602, |
|
"eval_loss": 1.2668291330337524, |
|
"eval_runtime": 258.9434, |
|
"eval_samples_per_second": 156.799, |
|
"eval_steps_per_second": 4.901, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.891143405538799, |
|
"grad_norm": 3.029754400253296, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.2117, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.891143405538799, |
|
"eval_accuracy": 0.7750832812825318, |
|
"eval_loss": 1.2626760005950928, |
|
"eval_runtime": 260.4421, |
|
"eval_samples_per_second": 155.896, |
|
"eval_steps_per_second": 4.872, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"grad_norm": 2.961531639099121, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 1.1924, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"eval_accuracy": 0.7771516510947648, |
|
"eval_loss": 1.2538079023361206, |
|
"eval_runtime": 260.0863, |
|
"eval_samples_per_second": 156.11, |
|
"eval_steps_per_second": 4.879, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0282423910063065, |
|
"grad_norm": 3.120314121246338, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 1.1807, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0282423910063065, |
|
"eval_accuracy": 0.7786684520598107, |
|
"eval_loss": 1.247827410697937, |
|
"eval_runtime": 258.2089, |
|
"eval_samples_per_second": 157.245, |
|
"eval_steps_per_second": 4.915, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.0967918837400603, |
|
"grad_norm": 3.0152571201324463, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.1666, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0967918837400603, |
|
"eval_accuracy": 0.7806228052923374, |
|
"eval_loss": 1.2236727476119995, |
|
"eval_runtime": 258.1981, |
|
"eval_samples_per_second": 157.251, |
|
"eval_steps_per_second": 4.915, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.165341376473814, |
|
"grad_norm": 4.560582637786865, |
|
"learning_rate": 9.15e-05, |
|
"loss": 1.1582, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.165341376473814, |
|
"eval_accuracy": 0.7811331658792164, |
|
"eval_loss": 1.2169686555862427, |
|
"eval_runtime": 257.413, |
|
"eval_samples_per_second": 157.731, |
|
"eval_steps_per_second": 4.93, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.2338908692075679, |
|
"grad_norm": 2.940659523010254, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.1376, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.2338908692075679, |
|
"eval_accuracy": 0.7844609998371601, |
|
"eval_loss": 1.1992230415344238, |
|
"eval_runtime": 257.3928, |
|
"eval_samples_per_second": 157.743, |
|
"eval_steps_per_second": 4.93, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.3024403619413216, |
|
"grad_norm": 3.4134812355041504, |
|
"learning_rate": 9.05e-05, |
|
"loss": 1.1358, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.3024403619413216, |
|
"eval_accuracy": 0.7851634886822189, |
|
"eval_loss": 1.2106844186782837, |
|
"eval_runtime": 257.3145, |
|
"eval_samples_per_second": 157.791, |
|
"eval_steps_per_second": 4.932, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.3709898546750754, |
|
"grad_norm": 2.8850438594818115, |
|
"learning_rate": 9e-05, |
|
"loss": 1.1281, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.3709898546750754, |
|
"eval_accuracy": 0.7872160784699163, |
|
"eval_loss": 1.2045215368270874, |
|
"eval_runtime": 257.4649, |
|
"eval_samples_per_second": 157.699, |
|
"eval_steps_per_second": 4.929, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.4395393474088292, |
|
"grad_norm": 2.752389669418335, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 1.119, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.4395393474088292, |
|
"eval_accuracy": 0.7882877785798362, |
|
"eval_loss": 1.1866884231567383, |
|
"eval_runtime": 257.0898, |
|
"eval_samples_per_second": 157.929, |
|
"eval_steps_per_second": 4.936, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.508088840142583, |
|
"grad_norm": 3.4821131229400635, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.1052, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.508088840142583, |
|
"eval_accuracy": 0.7891213465142315, |
|
"eval_loss": 1.177182912826538, |
|
"eval_runtime": 257.3303, |
|
"eval_samples_per_second": 157.782, |
|
"eval_steps_per_second": 4.931, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.5766383328763367, |
|
"grad_norm": 2.641080379486084, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 1.0969, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.5766383328763367, |
|
"eval_accuracy": 0.790978984380183, |
|
"eval_loss": 1.1677839756011963, |
|
"eval_runtime": 257.8775, |
|
"eval_samples_per_second": 157.447, |
|
"eval_steps_per_second": 4.921, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.6451878256100905, |
|
"grad_norm": 2.670703172683716, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.0973, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.6451878256100905, |
|
"eval_accuracy": 0.7914186279008998, |
|
"eval_loss": 1.1657401323318481, |
|
"eval_runtime": 257.6044, |
|
"eval_samples_per_second": 157.614, |
|
"eval_steps_per_second": 4.926, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.7137373183438442, |
|
"grad_norm": 4.184514045715332, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.0931, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.7137373183438442, |
|
"eval_accuracy": 0.7924260144852202, |
|
"eval_loss": 1.1656056642532349, |
|
"eval_runtime": 257.7225, |
|
"eval_samples_per_second": 157.542, |
|
"eval_steps_per_second": 4.924, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.782286811077598, |
|
"grad_norm": 3.4420437812805176, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.0882, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.782286811077598, |
|
"eval_accuracy": 0.7937018737958299, |
|
"eval_loss": 1.159055471420288, |
|
"eval_runtime": 257.6253, |
|
"eval_samples_per_second": 157.601, |
|
"eval_steps_per_second": 4.926, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.8508363038113518, |
|
"grad_norm": 2.942854642868042, |
|
"learning_rate": 8.65e-05, |
|
"loss": 1.0802, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.8508363038113518, |
|
"eval_accuracy": 0.7948528675102821, |
|
"eval_loss": 1.1472080945968628, |
|
"eval_runtime": 257.8341, |
|
"eval_samples_per_second": 157.473, |
|
"eval_steps_per_second": 4.922, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.9193857965451055, |
|
"grad_norm": 2.382511854171753, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.0766, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.9193857965451055, |
|
"eval_accuracy": 0.7966625616136973, |
|
"eval_loss": 1.1392544507980347, |
|
"eval_runtime": 258.8105, |
|
"eval_samples_per_second": 156.879, |
|
"eval_steps_per_second": 4.903, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.9879352892788593, |
|
"grad_norm": 2.583773374557495, |
|
"learning_rate": 8.55e-05, |
|
"loss": 1.0719, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.9879352892788593, |
|
"eval_accuracy": 0.7972140563559621, |
|
"eval_loss": 1.1423135995864868, |
|
"eval_runtime": 257.6602, |
|
"eval_samples_per_second": 157.58, |
|
"eval_steps_per_second": 4.925, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.056484782012613, |
|
"grad_norm": 2.788512945175171, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.0553, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.056484782012613, |
|
"eval_accuracy": 0.7980805136441053, |
|
"eval_loss": 1.128947377204895, |
|
"eval_runtime": 257.7272, |
|
"eval_samples_per_second": 157.539, |
|
"eval_steps_per_second": 4.924, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.125034274746367, |
|
"grad_norm": 2.9311256408691406, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 1.0451, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.125034274746367, |
|
"eval_accuracy": 0.7988239883829299, |
|
"eval_loss": 1.1336219310760498, |
|
"eval_runtime": 258.1352, |
|
"eval_samples_per_second": 157.29, |
|
"eval_steps_per_second": 4.916, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.1935837674801206, |
|
"grad_norm": 3.1457791328430176, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.0423, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.1935837674801206, |
|
"eval_accuracy": 0.7996375561822584, |
|
"eval_loss": 1.1223351955413818, |
|
"eval_runtime": 258.998, |
|
"eval_samples_per_second": 156.766, |
|
"eval_steps_per_second": 4.9, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.2621332602138744, |
|
"grad_norm": 2.9234752655029297, |
|
"learning_rate": 8.35e-05, |
|
"loss": 1.0449, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.2621332602138744, |
|
"eval_accuracy": 0.8002099641553145, |
|
"eval_loss": 1.1050431728363037, |
|
"eval_runtime": 258.0627, |
|
"eval_samples_per_second": 157.334, |
|
"eval_steps_per_second": 4.917, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.330682752947628, |
|
"grad_norm": 2.9982569217681885, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.0312, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.330682752947628, |
|
"eval_accuracy": 0.8017049854116234, |
|
"eval_loss": 1.1168311834335327, |
|
"eval_runtime": 258.8729, |
|
"eval_samples_per_second": 156.841, |
|
"eval_steps_per_second": 4.902, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.399232245681382, |
|
"grad_norm": 2.63649845123291, |
|
"learning_rate": 8.25e-05, |
|
"loss": 1.0291, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.399232245681382, |
|
"eval_accuracy": 0.8025579020578063, |
|
"eval_loss": 1.1074473857879639, |
|
"eval_runtime": 257.7749, |
|
"eval_samples_per_second": 157.51, |
|
"eval_steps_per_second": 4.923, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.4677817384151357, |
|
"grad_norm": 3.122042417526245, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.0245, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.4677817384151357, |
|
"eval_accuracy": 0.802570536529715, |
|
"eval_loss": 1.1076058149337769, |
|
"eval_runtime": 258.567, |
|
"eval_samples_per_second": 157.027, |
|
"eval_steps_per_second": 4.908, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.5363312311488895, |
|
"grad_norm": 3.2931995391845703, |
|
"learning_rate": 8.15e-05, |
|
"loss": 1.0236, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.5363312311488895, |
|
"eval_accuracy": 0.803701212821904, |
|
"eval_loss": 1.092323660850525, |
|
"eval_runtime": 257.5769, |
|
"eval_samples_per_second": 157.631, |
|
"eval_steps_per_second": 4.927, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.6048807238826432, |
|
"grad_norm": 3.31691837310791, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.0218, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.6048807238826432, |
|
"eval_accuracy": 0.8040252089042342, |
|
"eval_loss": 1.0852642059326172, |
|
"eval_runtime": 258.7184, |
|
"eval_samples_per_second": 156.935, |
|
"eval_steps_per_second": 4.905, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.673430216616397, |
|
"grad_norm": 2.602132558822632, |
|
"learning_rate": 8.05e-05, |
|
"loss": 1.0101, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.673430216616397, |
|
"eval_accuracy": 0.8049598014763766, |
|
"eval_loss": 1.0851576328277588, |
|
"eval_runtime": 259.1257, |
|
"eval_samples_per_second": 156.688, |
|
"eval_steps_per_second": 4.897, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.741979709350151, |
|
"grad_norm": 2.6089420318603516, |
|
"learning_rate": 8e-05, |
|
"loss": 1.0154, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.741979709350151, |
|
"eval_accuracy": 0.8060563142838197, |
|
"eval_loss": 1.0850160121917725, |
|
"eval_runtime": 257.7528, |
|
"eval_samples_per_second": 157.523, |
|
"eval_steps_per_second": 4.923, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.8105292020839046, |
|
"grad_norm": 2.57804536819458, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 1.0092, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.8105292020839046, |
|
"eval_accuracy": 0.8063966673859484, |
|
"eval_loss": 1.064876675605774, |
|
"eval_runtime": 257.3862, |
|
"eval_samples_per_second": 157.747, |
|
"eval_steps_per_second": 4.93, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.8790786948176583, |
|
"grad_norm": 2.919243097305298, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.9962, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.8790786948176583, |
|
"eval_accuracy": 0.8074074812889718, |
|
"eval_loss": 1.0758228302001953, |
|
"eval_runtime": 257.4397, |
|
"eval_samples_per_second": 157.715, |
|
"eval_steps_per_second": 4.929, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.947628187551412, |
|
"grad_norm": 2.7142975330352783, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 0.9949, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.947628187551412, |
|
"eval_accuracy": 0.8072095077707074, |
|
"eval_loss": 1.0723259449005127, |
|
"eval_runtime": 257.4852, |
|
"eval_samples_per_second": 157.687, |
|
"eval_steps_per_second": 4.928, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 3.016177680285166, |
|
"grad_norm": 2.461714267730713, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.9933, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.016177680285166, |
|
"eval_accuracy": 0.8092224160930719, |
|
"eval_loss": 1.0564687252044678, |
|
"eval_runtime": 262.5555, |
|
"eval_samples_per_second": 154.642, |
|
"eval_steps_per_second": 4.833, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 3.0847271730189196, |
|
"grad_norm": 3.128793716430664, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.9751, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.0847271730189196, |
|
"eval_accuracy": 0.8092534032416199, |
|
"eval_loss": 1.0642082691192627, |
|
"eval_runtime": 263.2784, |
|
"eval_samples_per_second": 154.217, |
|
"eval_steps_per_second": 4.82, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 3.1532766657526734, |
|
"grad_norm": 2.560393810272217, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.9792, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.1532766657526734, |
|
"eval_accuracy": 0.8098133712563933, |
|
"eval_loss": 1.0663542747497559, |
|
"eval_runtime": 279.6936, |
|
"eval_samples_per_second": 145.166, |
|
"eval_steps_per_second": 4.537, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 3.221826158486427, |
|
"grad_norm": 2.993088483810425, |
|
"learning_rate": 7.65e-05, |
|
"loss": 0.9671, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.221826158486427, |
|
"eval_accuracy": 0.8106374773041439, |
|
"eval_loss": 1.0509235858917236, |
|
"eval_runtime": 302.0862, |
|
"eval_samples_per_second": 134.405, |
|
"eval_steps_per_second": 4.201, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 3.290375651220181, |
|
"grad_norm": 2.796325445175171, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.9667, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.290375651220181, |
|
"eval_accuracy": 0.8117766899143959, |
|
"eval_loss": 1.0434768199920654, |
|
"eval_runtime": 302.1312, |
|
"eval_samples_per_second": 134.385, |
|
"eval_steps_per_second": 4.2, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.3589251439539347, |
|
"grad_norm": 3.067168712615967, |
|
"learning_rate": 7.55e-05, |
|
"loss": 0.9676, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.3589251439539347, |
|
"eval_accuracy": 0.811503557227325, |
|
"eval_loss": 1.0434749126434326, |
|
"eval_runtime": 303.0455, |
|
"eval_samples_per_second": 133.98, |
|
"eval_steps_per_second": 4.187, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 3.4274746366876885, |
|
"grad_norm": 6.064915657043457, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.9659, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.4274746366876885, |
|
"eval_accuracy": 0.8128527567965343, |
|
"eval_loss": 1.0416052341461182, |
|
"eval_runtime": 301.3987, |
|
"eval_samples_per_second": 134.712, |
|
"eval_steps_per_second": 4.21, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 3.4960241294214423, |
|
"grad_norm": 2.412940740585327, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.9534, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.4960241294214423, |
|
"eval_accuracy": 0.8132953171379946, |
|
"eval_loss": 1.0385822057724, |
|
"eval_runtime": 300.1799, |
|
"eval_samples_per_second": 135.259, |
|
"eval_steps_per_second": 4.227, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 3.564573622155196, |
|
"grad_norm": 2.9293601512908936, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.9579, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.564573622155196, |
|
"eval_accuracy": 0.8135014968664439, |
|
"eval_loss": 1.0341033935546875, |
|
"eval_runtime": 302.1905, |
|
"eval_samples_per_second": 134.359, |
|
"eval_steps_per_second": 4.199, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 3.63312311488895, |
|
"grad_norm": 2.3857290744781494, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.9562, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.63312311488895, |
|
"eval_accuracy": 0.8136511493390661, |
|
"eval_loss": 1.0400645732879639, |
|
"eval_runtime": 300.9192, |
|
"eval_samples_per_second": 134.927, |
|
"eval_steps_per_second": 4.217, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 3.7016726076227036, |
|
"grad_norm": 2.8844683170318604, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.9581, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.7016726076227036, |
|
"eval_accuracy": 0.814749558109949, |
|
"eval_loss": 1.0379000902175903, |
|
"eval_runtime": 300.8792, |
|
"eval_samples_per_second": 134.945, |
|
"eval_steps_per_second": 4.218, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 3.7702221003564573, |
|
"grad_norm": 3.2288286685943604, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.9524, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.7702221003564573, |
|
"eval_accuracy": 0.8149326212786671, |
|
"eval_loss": 1.0268869400024414, |
|
"eval_runtime": 303.3311, |
|
"eval_samples_per_second": 133.854, |
|
"eval_steps_per_second": 4.184, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 3.838771593090211, |
|
"grad_norm": 2.84405255317688, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.9366, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.838771593090211, |
|
"eval_accuracy": 0.8165028910386263, |
|
"eval_loss": 1.0258753299713135, |
|
"eval_runtime": 301.0226, |
|
"eval_samples_per_second": 134.88, |
|
"eval_steps_per_second": 4.216, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 3.907321085823965, |
|
"grad_norm": 2.78871488571167, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.9489, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.907321085823965, |
|
"eval_accuracy": 0.8153732124964858, |
|
"eval_loss": 1.0232901573181152, |
|
"eval_runtime": 303.0362, |
|
"eval_samples_per_second": 133.984, |
|
"eval_steps_per_second": 4.188, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.9758705785577186, |
|
"grad_norm": 2.6128122806549072, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.9372, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.9758705785577186, |
|
"eval_accuracy": 0.8167775740472835, |
|
"eval_loss": 1.0158660411834717, |
|
"eval_runtime": 305.0716, |
|
"eval_samples_per_second": 133.09, |
|
"eval_steps_per_second": 4.16, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 4.044420071291473, |
|
"grad_norm": 2.4649457931518555, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.9389, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.044420071291473, |
|
"eval_accuracy": 0.8169951094301297, |
|
"eval_loss": 1.0179492235183716, |
|
"eval_runtime": 302.9032, |
|
"eval_samples_per_second": 134.043, |
|
"eval_steps_per_second": 4.189, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 4.112969564025226, |
|
"grad_norm": 2.637385845184326, |
|
"learning_rate": 7e-05, |
|
"loss": 0.9224, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 4.112969564025226, |
|
"eval_accuracy": 0.8178961830392573, |
|
"eval_loss": 1.0067319869995117, |
|
"eval_runtime": 300.9419, |
|
"eval_samples_per_second": 134.916, |
|
"eval_steps_per_second": 4.217, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 4.18151905675898, |
|
"grad_norm": 2.863875389099121, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.9205, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 4.18151905675898, |
|
"eval_accuracy": 0.8184249147223241, |
|
"eval_loss": 1.0113714933395386, |
|
"eval_runtime": 301.5875, |
|
"eval_samples_per_second": 134.628, |
|
"eval_steps_per_second": 4.208, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 4.250068549492734, |
|
"grad_norm": 2.4322681427001953, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.9247, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.250068549492734, |
|
"eval_accuracy": 0.8186936629410247, |
|
"eval_loss": 1.003678798675537, |
|
"eval_runtime": 303.1297, |
|
"eval_samples_per_second": 133.943, |
|
"eval_steps_per_second": 4.186, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 4.318618042226488, |
|
"grad_norm": 2.998030424118042, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.9178, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.318618042226488, |
|
"eval_accuracy": 0.8182408058742606, |
|
"eval_loss": 1.0021617412567139, |
|
"eval_runtime": 302.4668, |
|
"eval_samples_per_second": 134.236, |
|
"eval_steps_per_second": 4.196, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 4.387167534960241, |
|
"grad_norm": 2.5294859409332275, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.9176, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.387167534960241, |
|
"eval_accuracy": 0.8199272207420885, |
|
"eval_loss": 1.0028120279312134, |
|
"eval_runtime": 301.3201, |
|
"eval_samples_per_second": 134.747, |
|
"eval_steps_per_second": 4.211, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.4557170276939955, |
|
"grad_norm": 4.368305206298828, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.9187, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.4557170276939955, |
|
"eval_accuracy": 0.8202180503701034, |
|
"eval_loss": 1.0021815299987793, |
|
"eval_runtime": 301.1897, |
|
"eval_samples_per_second": 134.805, |
|
"eval_steps_per_second": 4.213, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 4.524266520427749, |
|
"grad_norm": 3.3230433464050293, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.9169, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.524266520427749, |
|
"eval_accuracy": 0.8205231724765897, |
|
"eval_loss": 0.9979987740516663, |
|
"eval_runtime": 300.7001, |
|
"eval_samples_per_second": 135.025, |
|
"eval_steps_per_second": 4.22, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.592816013161503, |
|
"grad_norm": 2.592043876647949, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.9125, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 4.592816013161503, |
|
"eval_accuracy": 0.8206935126919565, |
|
"eval_loss": 0.9938598871231079, |
|
"eval_runtime": 301.9997, |
|
"eval_samples_per_second": 134.444, |
|
"eval_steps_per_second": 4.202, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 4.661365505895256, |
|
"grad_norm": 2.446427345275879, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.9146, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 4.661365505895256, |
|
"eval_accuracy": 0.8216566473447208, |
|
"eval_loss": 0.9849461913108826, |
|
"eval_runtime": 302.9739, |
|
"eval_samples_per_second": 134.012, |
|
"eval_steps_per_second": 4.188, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 4.72991499862901, |
|
"grad_norm": 2.884946346282959, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.9018, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 4.72991499862901, |
|
"eval_accuracy": 0.8217915724349476, |
|
"eval_loss": 1.0003894567489624, |
|
"eval_runtime": 300.4012, |
|
"eval_samples_per_second": 135.159, |
|
"eval_steps_per_second": 4.224, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 4.798464491362764, |
|
"grad_norm": 2.8886282444000244, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.9014, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.798464491362764, |
|
"eval_accuracy": 0.8224961720866916, |
|
"eval_loss": 0.9889456629753113, |
|
"eval_runtime": 301.0802, |
|
"eval_samples_per_second": 134.854, |
|
"eval_steps_per_second": 4.215, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 4.867013984096518, |
|
"grad_norm": 2.473068952560425, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.8919, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 4.867013984096518, |
|
"eval_accuracy": 0.822945545786959, |
|
"eval_loss": 0.9848706722259521, |
|
"eval_runtime": 303.9067, |
|
"eval_samples_per_second": 133.6, |
|
"eval_steps_per_second": 4.176, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 4.935563476830271, |
|
"grad_norm": 3.0716209411621094, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.8993, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 4.935563476830271, |
|
"eval_accuracy": 0.8222583681418539, |
|
"eval_loss": 0.9929753541946411, |
|
"eval_runtime": 299.5059, |
|
"eval_samples_per_second": 135.563, |
|
"eval_steps_per_second": 4.237, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 5.004112969564026, |
|
"grad_norm": 2.3323957920074463, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.9009, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 5.004112969564026, |
|
"eval_accuracy": 0.8241444257225273, |
|
"eval_loss": 0.9798668622970581, |
|
"eval_runtime": 299.3852, |
|
"eval_samples_per_second": 135.618, |
|
"eval_steps_per_second": 4.239, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 5.072662462297779, |
|
"grad_norm": 2.7152209281921387, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.8843, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 5.072662462297779, |
|
"eval_accuracy": 0.8234012575934279, |
|
"eval_loss": 0.9811968803405762, |
|
"eval_runtime": 300.6715, |
|
"eval_samples_per_second": 135.038, |
|
"eval_steps_per_second": 4.221, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 5.141211955031533, |
|
"grad_norm": 2.526486396789551, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.8846, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 5.141211955031533, |
|
"eval_accuracy": 0.8247555724795991, |
|
"eval_loss": 0.9730820655822754, |
|
"eval_runtime": 299.291, |
|
"eval_samples_per_second": 135.661, |
|
"eval_steps_per_second": 4.24, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 5.2097614477652865, |
|
"grad_norm": 2.5805063247680664, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.8807, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 5.2097614477652865, |
|
"eval_accuracy": 0.8250464067024924, |
|
"eval_loss": 0.9684708118438721, |
|
"eval_runtime": 302.4457, |
|
"eval_samples_per_second": 134.246, |
|
"eval_steps_per_second": 4.196, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 5.278310940499041, |
|
"grad_norm": 2.559605360031128, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.8802, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.278310940499041, |
|
"eval_accuracy": 0.8254996987535631, |
|
"eval_loss": 0.973818838596344, |
|
"eval_runtime": 300.146, |
|
"eval_samples_per_second": 135.274, |
|
"eval_steps_per_second": 4.228, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 5.346860433232794, |
|
"grad_norm": 2.1615304946899414, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.8789, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.346860433232794, |
|
"eval_accuracy": 0.8254549864960571, |
|
"eval_loss": 0.9578101634979248, |
|
"eval_runtime": 302.5653, |
|
"eval_samples_per_second": 134.193, |
|
"eval_steps_per_second": 4.194, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 5.415409925966548, |
|
"grad_norm": 2.2763609886169434, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.8843, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 5.415409925966548, |
|
"eval_accuracy": 0.8262787384248012, |
|
"eval_loss": 0.9698151350021362, |
|
"eval_runtime": 301.7649, |
|
"eval_samples_per_second": 134.548, |
|
"eval_steps_per_second": 4.205, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 5.483959418700302, |
|
"grad_norm": 2.3774330615997314, |
|
"learning_rate": 6e-05, |
|
"loss": 0.8714, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 5.483959418700302, |
|
"eval_accuracy": 0.8263406985859876, |
|
"eval_loss": 0.9681651592254639, |
|
"eval_runtime": 302.4298, |
|
"eval_samples_per_second": 134.253, |
|
"eval_steps_per_second": 4.196, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 5.552508911434055, |
|
"grad_norm": 2.3430614471435547, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.8676, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 5.552508911434055, |
|
"eval_accuracy": 0.8263813882499345, |
|
"eval_loss": 0.9515417814254761, |
|
"eval_runtime": 301.876, |
|
"eval_samples_per_second": 134.499, |
|
"eval_steps_per_second": 4.204, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 5.621058404167809, |
|
"grad_norm": 2.3059141635894775, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.8721, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 5.621058404167809, |
|
"eval_accuracy": 0.8275216377261674, |
|
"eval_loss": 0.962243914604187, |
|
"eval_runtime": 302.2447, |
|
"eval_samples_per_second": 134.335, |
|
"eval_steps_per_second": 4.199, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 5.689607896901563, |
|
"grad_norm": 2.462218999862671, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.8699, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 5.689607896901563, |
|
"eval_accuracy": 0.8283203266406056, |
|
"eval_loss": 0.9477165341377258, |
|
"eval_runtime": 302.7485, |
|
"eval_samples_per_second": 134.111, |
|
"eval_steps_per_second": 4.192, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 5.758157389635317, |
|
"grad_norm": 3.0347349643707275, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.8634, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 5.758157389635317, |
|
"eval_accuracy": 0.8281108707620414, |
|
"eval_loss": 0.9486715197563171, |
|
"eval_runtime": 304.225, |
|
"eval_samples_per_second": 133.46, |
|
"eval_steps_per_second": 4.171, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 5.82670688236907, |
|
"grad_norm": 3.0054922103881836, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.8743, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 5.82670688236907, |
|
"eval_accuracy": 0.8284311134181968, |
|
"eval_loss": 0.9539070725440979, |
|
"eval_runtime": 304.8553, |
|
"eval_samples_per_second": 133.185, |
|
"eval_steps_per_second": 4.163, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 5.895256375102824, |
|
"grad_norm": 2.29243540763855, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.8667, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.895256375102824, |
|
"eval_accuracy": 0.8291235685160401, |
|
"eval_loss": 0.9470139145851135, |
|
"eval_runtime": 298.7685, |
|
"eval_samples_per_second": 135.898, |
|
"eval_steps_per_second": 4.247, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 5.963805867836578, |
|
"grad_norm": 2.5743372440338135, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.8681, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 5.963805867836578, |
|
"eval_accuracy": 0.8291158725629887, |
|
"eval_loss": 0.946834921836853, |
|
"eval_runtime": 284.2736, |
|
"eval_samples_per_second": 142.827, |
|
"eval_steps_per_second": 4.464, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 6.032355360570332, |
|
"grad_norm": 2.33494234085083, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.8594, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 6.032355360570332, |
|
"eval_accuracy": 0.8301663716691428, |
|
"eval_loss": 0.9472524523735046, |
|
"eval_runtime": 261.9172, |
|
"eval_samples_per_second": 155.018, |
|
"eval_steps_per_second": 4.845, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 6.100904853304086, |
|
"grad_norm": 2.7616426944732666, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.8517, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 6.100904853304086, |
|
"eval_accuracy": 0.8304027916380742, |
|
"eval_loss": 0.9408496022224426, |
|
"eval_runtime": 257.341, |
|
"eval_samples_per_second": 157.775, |
|
"eval_steps_per_second": 4.931, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 6.169454346037839, |
|
"grad_norm": 2.6394338607788086, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.8453, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 6.169454346037839, |
|
"eval_accuracy": 0.8302381896975964, |
|
"eval_loss": 0.945652425289154, |
|
"eval_runtime": 257.4862, |
|
"eval_samples_per_second": 157.686, |
|
"eval_steps_per_second": 4.928, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 6.2380038387715935, |
|
"grad_norm": 2.6004316806793213, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.8486, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 6.2380038387715935, |
|
"eval_accuracy": 0.8311095745962099, |
|
"eval_loss": 0.940719485282898, |
|
"eval_runtime": 258.4002, |
|
"eval_samples_per_second": 157.128, |
|
"eval_steps_per_second": 4.911, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 6.306553331505347, |
|
"grad_norm": 2.722169876098633, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.8469, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 6.306553331505347, |
|
"eval_accuracy": 0.8303636681664364, |
|
"eval_loss": 0.9365447759628296, |
|
"eval_runtime": 257.5562, |
|
"eval_samples_per_second": 157.643, |
|
"eval_steps_per_second": 4.927, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 6.375102824239101, |
|
"grad_norm": 2.955397367477417, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.8434, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 6.375102824239101, |
|
"eval_accuracy": 0.8311330928241353, |
|
"eval_loss": 0.9382375478744507, |
|
"eval_runtime": 258.6675, |
|
"eval_samples_per_second": 156.966, |
|
"eval_steps_per_second": 4.906, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 6.443652316972854, |
|
"grad_norm": 2.375140428543091, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.8343, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 6.443652316972854, |
|
"eval_accuracy": 0.8315033650181133, |
|
"eval_loss": 0.934901773929596, |
|
"eval_runtime": 257.5199, |
|
"eval_samples_per_second": 157.665, |
|
"eval_steps_per_second": 4.928, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 6.512201809706609, |
|
"grad_norm": 2.4617624282836914, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.8312, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 6.512201809706609, |
|
"eval_accuracy": 0.8317886916287004, |
|
"eval_loss": 0.9246230721473694, |
|
"eval_runtime": 258.5693, |
|
"eval_samples_per_second": 157.026, |
|
"eval_steps_per_second": 4.908, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 6.580751302440362, |
|
"grad_norm": 2.4794909954071045, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.8365, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 6.580751302440362, |
|
"eval_accuracy": 0.8332747655954476, |
|
"eval_loss": 0.9223575592041016, |
|
"eval_runtime": 257.532, |
|
"eval_samples_per_second": 157.658, |
|
"eval_steps_per_second": 4.928, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 6.649300795174116, |
|
"grad_norm": 2.893775224685669, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.8307, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 6.649300795174116, |
|
"eval_accuracy": 0.8332412930945546, |
|
"eval_loss": 0.9224662184715271, |
|
"eval_runtime": 257.3988, |
|
"eval_samples_per_second": 157.74, |
|
"eval_steps_per_second": 4.93, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 6.717850287907869, |
|
"grad_norm": 2.4228713512420654, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.838, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 6.717850287907869, |
|
"eval_accuracy": 0.8337124591782815, |
|
"eval_loss": 0.9226129055023193, |
|
"eval_runtime": 258.4175, |
|
"eval_samples_per_second": 157.118, |
|
"eval_steps_per_second": 4.911, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 6.786399780641624, |
|
"grad_norm": 2.462571144104004, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.8355, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 6.786399780641624, |
|
"eval_accuracy": 0.8337023695286286, |
|
"eval_loss": 0.9333141446113586, |
|
"eval_runtime": 257.7508, |
|
"eval_samples_per_second": 157.524, |
|
"eval_steps_per_second": 4.923, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 6.854949273375377, |
|
"grad_norm": 2.5558559894561768, |
|
"learning_rate": 5e-05, |
|
"loss": 0.8391, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 6.854949273375377, |
|
"eval_accuracy": 0.8339626635022332, |
|
"eval_loss": 0.9166584610939026, |
|
"eval_runtime": 258.4946, |
|
"eval_samples_per_second": 157.071, |
|
"eval_steps_per_second": 4.909, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 6.923498766109131, |
|
"grad_norm": 2.733778953552246, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.834, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 6.923498766109131, |
|
"eval_accuracy": 0.8344111438621534, |
|
"eval_loss": 0.9235773086547852, |
|
"eval_runtime": 258.6401, |
|
"eval_samples_per_second": 156.983, |
|
"eval_steps_per_second": 4.906, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 6.9920482588428845, |
|
"grad_norm": 2.802053451538086, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.8269, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 6.9920482588428845, |
|
"eval_accuracy": 0.8348735131396657, |
|
"eval_loss": 0.920405924320221, |
|
"eval_runtime": 257.5552, |
|
"eval_samples_per_second": 157.644, |
|
"eval_steps_per_second": 4.927, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 7.060597751576639, |
|
"grad_norm": 2.666555404663086, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.8085, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 7.060597751576639, |
|
"eval_accuracy": 0.8347147667297783, |
|
"eval_loss": 0.9154396057128906, |
|
"eval_runtime": 258.4259, |
|
"eval_samples_per_second": 157.113, |
|
"eval_steps_per_second": 4.91, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 7.129147244310392, |
|
"grad_norm": 2.4190189838409424, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.819, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 7.129147244310392, |
|
"eval_accuracy": 0.8344749377257509, |
|
"eval_loss": 0.9256834983825684, |
|
"eval_runtime": 258.4604, |
|
"eval_samples_per_second": 157.092, |
|
"eval_steps_per_second": 4.91, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 7.197696737044146, |
|
"grad_norm": 2.8802294731140137, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.8238, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 7.197696737044146, |
|
"eval_accuracy": 0.8351059340465287, |
|
"eval_loss": 0.9185708165168762, |
|
"eval_runtime": 258.4374, |
|
"eval_samples_per_second": 157.106, |
|
"eval_steps_per_second": 4.91, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 7.2662462297779, |
|
"grad_norm": 2.5864334106445312, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.8065, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 7.2662462297779, |
|
"eval_accuracy": 0.8358540639956455, |
|
"eval_loss": 0.9095313549041748, |
|
"eval_runtime": 258.6881, |
|
"eval_samples_per_second": 156.953, |
|
"eval_steps_per_second": 4.906, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 7.334795722511654, |
|
"grad_norm": 3.0811657905578613, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.8199, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 7.334795722511654, |
|
"eval_accuracy": 0.8364544665607716, |
|
"eval_loss": 0.9065914154052734, |
|
"eval_runtime": 258.6721, |
|
"eval_samples_per_second": 156.963, |
|
"eval_steps_per_second": 4.906, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 7.403345215245407, |
|
"grad_norm": 2.3102753162384033, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.8018, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 7.403345215245407, |
|
"eval_accuracy": 0.8359817384838574, |
|
"eval_loss": 0.9075337052345276, |
|
"eval_runtime": 257.5046, |
|
"eval_samples_per_second": 157.675, |
|
"eval_steps_per_second": 4.928, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 7.471894707979161, |
|
"grad_norm": 2.269843578338623, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.8102, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 7.471894707979161, |
|
"eval_accuracy": 0.8372314644838865, |
|
"eval_loss": 0.9001559019088745, |
|
"eval_runtime": 258.4508, |
|
"eval_samples_per_second": 157.098, |
|
"eval_steps_per_second": 4.91, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 7.540444200712915, |
|
"grad_norm": 2.617309093475342, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.8194, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 7.540444200712915, |
|
"eval_accuracy": 0.8369146258637555, |
|
"eval_loss": 0.899241030216217, |
|
"eval_runtime": 257.4931, |
|
"eval_samples_per_second": 157.682, |
|
"eval_steps_per_second": 4.928, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 7.608993693446669, |
|
"grad_norm": 2.4634625911712646, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.8138, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 7.608993693446669, |
|
"eval_accuracy": 0.8369996392235467, |
|
"eval_loss": 0.9033562541007996, |
|
"eval_runtime": 258.54, |
|
"eval_samples_per_second": 157.043, |
|
"eval_steps_per_second": 4.908, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 7.677543186180422, |
|
"grad_norm": 2.850604772567749, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.8077, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 7.677543186180422, |
|
"eval_accuracy": 0.8378181332713783, |
|
"eval_loss": 0.9002473950386047, |
|
"eval_runtime": 257.5581, |
|
"eval_samples_per_second": 157.642, |
|
"eval_steps_per_second": 4.927, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 7.746092678914176, |
|
"grad_norm": 2.3677656650543213, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.8119, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 7.746092678914176, |
|
"eval_accuracy": 0.8382744553424182, |
|
"eval_loss": 0.8992937803268433, |
|
"eval_runtime": 258.5324, |
|
"eval_samples_per_second": 157.048, |
|
"eval_steps_per_second": 4.908, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 7.81464217164793, |
|
"grad_norm": 2.0961389541625977, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.8029, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 7.81464217164793, |
|
"eval_accuracy": 0.8377691746192005, |
|
"eval_loss": 0.8913019895553589, |
|
"eval_runtime": 258.5294, |
|
"eval_samples_per_second": 157.05, |
|
"eval_steps_per_second": 4.909, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 7.883191664381684, |
|
"grad_norm": 2.424496650695801, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.802, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 7.883191664381684, |
|
"eval_accuracy": 0.8387740254965514, |
|
"eval_loss": 0.8913179039955139, |
|
"eval_runtime": 258.6018, |
|
"eval_samples_per_second": 157.006, |
|
"eval_steps_per_second": 4.907, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 7.951741157115437, |
|
"grad_norm": 2.8273098468780518, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.7887, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 7.951741157115437, |
|
"eval_accuracy": 0.8389023572318363, |
|
"eval_loss": 0.8917869329452515, |
|
"eval_runtime": 259.2663, |
|
"eval_samples_per_second": 156.603, |
|
"eval_steps_per_second": 4.895, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 8.02029064984919, |
|
"grad_norm": 2.5863022804260254, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.7902, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 8.02029064984919, |
|
"eval_accuracy": 0.8385434985627828, |
|
"eval_loss": 0.8866747617721558, |
|
"eval_runtime": 257.9905, |
|
"eval_samples_per_second": 157.378, |
|
"eval_steps_per_second": 4.919, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 8.088840142582946, |
|
"grad_norm": 2.357172727584839, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.7892, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 8.088840142582946, |
|
"eval_accuracy": 0.8392591622264518, |
|
"eval_loss": 0.8825114369392395, |
|
"eval_runtime": 258.6746, |
|
"eval_samples_per_second": 156.962, |
|
"eval_steps_per_second": 4.906, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 8.157389635316699, |
|
"grad_norm": 2.3970017433166504, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.7928, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 8.157389635316699, |
|
"eval_accuracy": 0.8398791985985096, |
|
"eval_loss": 0.8858514428138733, |
|
"eval_runtime": 258.8038, |
|
"eval_samples_per_second": 156.883, |
|
"eval_steps_per_second": 4.903, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 8.225939128050452, |
|
"grad_norm": 2.1816744804382324, |
|
"learning_rate": 4e-05, |
|
"loss": 0.786, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 8.225939128050452, |
|
"eval_accuracy": 0.8399906549712216, |
|
"eval_loss": 0.8871041536331177, |
|
"eval_runtime": 257.814, |
|
"eval_samples_per_second": 157.486, |
|
"eval_steps_per_second": 4.922, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 8.294488620784206, |
|
"grad_norm": 2.6891512870788574, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.7838, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 8.294488620784206, |
|
"eval_accuracy": 0.8402320722657605, |
|
"eval_loss": 0.8783635497093201, |
|
"eval_runtime": 257.6277, |
|
"eval_samples_per_second": 157.6, |
|
"eval_steps_per_second": 4.926, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 8.36303811351796, |
|
"grad_norm": 2.2459070682525635, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.7857, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 8.36303811351796, |
|
"eval_accuracy": 0.8401854172212088, |
|
"eval_loss": 0.8782520294189453, |
|
"eval_runtime": 258.8529, |
|
"eval_samples_per_second": 156.854, |
|
"eval_steps_per_second": 4.902, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 8.431587606251714, |
|
"grad_norm": 2.3516621589660645, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.7807, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 8.431587606251714, |
|
"eval_accuracy": 0.8408464558399127, |
|
"eval_loss": 0.8879706263542175, |
|
"eval_runtime": 258.721, |
|
"eval_samples_per_second": 156.934, |
|
"eval_steps_per_second": 4.905, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 8.500137098985467, |
|
"grad_norm": 2.408498764038086, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.7869, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 8.500137098985467, |
|
"eval_accuracy": 0.8411305726347277, |
|
"eval_loss": 0.8754673004150391, |
|
"eval_runtime": 258.7894, |
|
"eval_samples_per_second": 156.892, |
|
"eval_steps_per_second": 4.904, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 8.56868659171922, |
|
"grad_norm": 2.7398715019226074, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.7768, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 8.56868659171922, |
|
"eval_accuracy": 0.8415116048695496, |
|
"eval_loss": 0.8766404390335083, |
|
"eval_runtime": 258.7705, |
|
"eval_samples_per_second": 156.903, |
|
"eval_steps_per_second": 4.904, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 8.637236084452976, |
|
"grad_norm": 2.7151975631713867, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.7806, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 8.637236084452976, |
|
"eval_accuracy": 0.8416668834829366, |
|
"eval_loss": 0.8836163282394409, |
|
"eval_runtime": 257.8541, |
|
"eval_samples_per_second": 157.461, |
|
"eval_steps_per_second": 4.921, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 8.70578557718673, |
|
"grad_norm": 2.50140380859375, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.7811, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 8.70578557718673, |
|
"eval_accuracy": 0.8423243126907809, |
|
"eval_loss": 0.8704027533531189, |
|
"eval_runtime": 256.7585, |
|
"eval_samples_per_second": 158.133, |
|
"eval_steps_per_second": 4.942, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 8.774335069920483, |
|
"grad_norm": 2.9045302867889404, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.7733, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 8.774335069920483, |
|
"eval_accuracy": 0.8424119193113302, |
|
"eval_loss": 0.8675287365913391, |
|
"eval_runtime": 258.0076, |
|
"eval_samples_per_second": 157.367, |
|
"eval_steps_per_second": 4.918, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 8.842884562654236, |
|
"grad_norm": 2.8266477584838867, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.7728, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 8.842884562654236, |
|
"eval_accuracy": 0.8425746458723948, |
|
"eval_loss": 0.8766723871231079, |
|
"eval_runtime": 257.1101, |
|
"eval_samples_per_second": 157.917, |
|
"eval_steps_per_second": 4.936, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 8.911434055387991, |
|
"grad_norm": 2.3678436279296875, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.7779, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 8.911434055387991, |
|
"eval_accuracy": 0.842695820921136, |
|
"eval_loss": 0.8741580843925476, |
|
"eval_runtime": 258.4977, |
|
"eval_samples_per_second": 157.069, |
|
"eval_steps_per_second": 4.909, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 8.979983548121744, |
|
"grad_norm": 2.4681596755981445, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.7779, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 8.979983548121744, |
|
"eval_accuracy": 0.8435885693668189, |
|
"eval_loss": 0.8590840697288513, |
|
"eval_runtime": 258.179, |
|
"eval_samples_per_second": 157.263, |
|
"eval_steps_per_second": 4.915, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 9.048533040855498, |
|
"grad_norm": 2.4200708866119385, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.7704, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 9.048533040855498, |
|
"eval_accuracy": 0.8437169036285643, |
|
"eval_loss": 0.8766728639602661, |
|
"eval_runtime": 258.0969, |
|
"eval_samples_per_second": 157.313, |
|
"eval_steps_per_second": 4.917, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 9.117082533589251, |
|
"grad_norm": 2.753324270248413, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.7695, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 9.117082533589251, |
|
"eval_accuracy": 0.8435838175840091, |
|
"eval_loss": 0.881564199924469, |
|
"eval_runtime": 259.2788, |
|
"eval_samples_per_second": 156.596, |
|
"eval_steps_per_second": 4.894, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 9.185632026323006, |
|
"grad_norm": 2.490852117538452, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.7617, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 9.185632026323006, |
|
"eval_accuracy": 0.8437435512372087, |
|
"eval_loss": 0.8751281499862671, |
|
"eval_runtime": 256.1107, |
|
"eval_samples_per_second": 158.533, |
|
"eval_steps_per_second": 4.955, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 9.25418151905676, |
|
"grad_norm": 2.581777334213257, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.7585, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 9.25418151905676, |
|
"eval_accuracy": 0.8439763030486503, |
|
"eval_loss": 0.8657551407814026, |
|
"eval_runtime": 257.7079, |
|
"eval_samples_per_second": 157.55, |
|
"eval_steps_per_second": 4.924, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 9.322731011790513, |
|
"grad_norm": 2.997283458709717, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.7657, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 9.322731011790513, |
|
"eval_accuracy": 0.8446350018812996, |
|
"eval_loss": 0.8639153838157654, |
|
"eval_runtime": 256.5945, |
|
"eval_samples_per_second": 158.234, |
|
"eval_steps_per_second": 4.946, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 9.391280504524266, |
|
"grad_norm": 2.7763564586639404, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.759, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 9.391280504524266, |
|
"eval_accuracy": 0.8448020556501544, |
|
"eval_loss": 0.8533274531364441, |
|
"eval_runtime": 256.963, |
|
"eval_samples_per_second": 158.007, |
|
"eval_steps_per_second": 4.938, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 9.459829997258021, |
|
"grad_norm": 2.864605665206909, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.7574, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 9.459829997258021, |
|
"eval_accuracy": 0.8444638910956694, |
|
"eval_loss": 0.8697899580001831, |
|
"eval_runtime": 255.8551, |
|
"eval_samples_per_second": 158.691, |
|
"eval_steps_per_second": 4.96, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 9.528379489991774, |
|
"grad_norm": 2.5367231369018555, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.7529, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 9.528379489991774, |
|
"eval_accuracy": 0.8451616733099838, |
|
"eval_loss": 0.8582028746604919, |
|
"eval_runtime": 256.6803, |
|
"eval_samples_per_second": 158.181, |
|
"eval_steps_per_second": 4.944, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 9.596928982725528, |
|
"grad_norm": 2.1710877418518066, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7577, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 9.596928982725528, |
|
"eval_accuracy": 0.845897620114414, |
|
"eval_loss": 0.8616137504577637, |
|
"eval_runtime": 255.6697, |
|
"eval_samples_per_second": 158.806, |
|
"eval_steps_per_second": 4.963, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 9.665478475459281, |
|
"grad_norm": 2.400867462158203, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.7554, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 9.665478475459281, |
|
"eval_accuracy": 0.8460372178025299, |
|
"eval_loss": 0.8466119766235352, |
|
"eval_runtime": 256.0692, |
|
"eval_samples_per_second": 158.559, |
|
"eval_steps_per_second": 4.956, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 9.734027968193036, |
|
"grad_norm": 2.6465237140655518, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.7406, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 9.734027968193036, |
|
"eval_accuracy": 0.846062622809469, |
|
"eval_loss": 0.8593913316726685, |
|
"eval_runtime": 257.0976, |
|
"eval_samples_per_second": 157.924, |
|
"eval_steps_per_second": 4.936, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 9.80257746092679, |
|
"grad_norm": 2.72021222114563, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.7543, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 9.80257746092679, |
|
"eval_accuracy": 0.8463149479101548, |
|
"eval_loss": 0.8515172600746155, |
|
"eval_runtime": 256.7199, |
|
"eval_samples_per_second": 158.157, |
|
"eval_steps_per_second": 4.943, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 9.871126953660543, |
|
"grad_norm": 2.639139175415039, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.7506, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 9.871126953660543, |
|
"eval_accuracy": 0.8465278129829884, |
|
"eval_loss": 0.8525589108467102, |
|
"eval_runtime": 258.3778, |
|
"eval_samples_per_second": 157.142, |
|
"eval_steps_per_second": 4.911, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 9.939676446394296, |
|
"grad_norm": 2.2242422103881836, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.7517, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 9.939676446394296, |
|
"eval_accuracy": 0.8470482563336492, |
|
"eval_loss": 0.8529332876205444, |
|
"eval_runtime": 257.3598, |
|
"eval_samples_per_second": 157.764, |
|
"eval_steps_per_second": 4.931, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 10.008225939128051, |
|
"grad_norm": 2.333671808242798, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.7425, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 10.008225939128051, |
|
"eval_accuracy": 0.8467576970668704, |
|
"eval_loss": 0.8543536067008972, |
|
"eval_runtime": 257.3323, |
|
"eval_samples_per_second": 157.78, |
|
"eval_steps_per_second": 4.931, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 10.076775431861805, |
|
"grad_norm": 2.4138877391815186, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.7464, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 10.076775431861805, |
|
"eval_accuracy": 0.8473971965940057, |
|
"eval_loss": 0.8455188870429993, |
|
"eval_runtime": 257.2349, |
|
"eval_samples_per_second": 157.84, |
|
"eval_steps_per_second": 4.933, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 10.145324924595558, |
|
"grad_norm": 2.812563180923462, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.7412, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 10.145324924595558, |
|
"eval_accuracy": 0.8475976099587939, |
|
"eval_loss": 0.8453831076622009, |
|
"eval_runtime": 256.9915, |
|
"eval_samples_per_second": 157.99, |
|
"eval_steps_per_second": 4.938, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 10.213874417329311, |
|
"grad_norm": 2.369260549545288, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.7346, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 10.213874417329311, |
|
"eval_accuracy": 0.8474954397549382, |
|
"eval_loss": 0.8512648344039917, |
|
"eval_runtime": 258.0335, |
|
"eval_samples_per_second": 157.352, |
|
"eval_steps_per_second": 4.918, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 10.282423910063066, |
|
"grad_norm": 2.7622134685516357, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.7424, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 10.282423910063066, |
|
"eval_accuracy": 0.8481833714959756, |
|
"eval_loss": 0.8343672156333923, |
|
"eval_runtime": 258.2629, |
|
"eval_samples_per_second": 157.212, |
|
"eval_steps_per_second": 4.914, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 10.35097340279682, |
|
"grad_norm": 2.2065768241882324, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.7364, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 10.35097340279682, |
|
"eval_accuracy": 0.8482113108532771, |
|
"eval_loss": 0.8340145945549011, |
|
"eval_runtime": 258.3206, |
|
"eval_samples_per_second": 157.177, |
|
"eval_steps_per_second": 4.913, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 10.419522895530573, |
|
"grad_norm": 2.186100721359253, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.7409, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 10.419522895530573, |
|
"eval_accuracy": 0.8489686223957396, |
|
"eval_loss": 0.8362465500831604, |
|
"eval_runtime": 256.9187, |
|
"eval_samples_per_second": 158.034, |
|
"eval_steps_per_second": 4.939, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 10.488072388264326, |
|
"grad_norm": 2.706817626953125, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.7353, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 10.488072388264326, |
|
"eval_accuracy": 0.8487501900476949, |
|
"eval_loss": 0.8368015289306641, |
|
"eval_runtime": 257.1636, |
|
"eval_samples_per_second": 157.884, |
|
"eval_steps_per_second": 4.935, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 10.556621880998081, |
|
"grad_norm": 2.523261308670044, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.731, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 10.556621880998081, |
|
"eval_accuracy": 0.8489030226241834, |
|
"eval_loss": 0.8337299823760986, |
|
"eval_runtime": 256.9915, |
|
"eval_samples_per_second": 157.99, |
|
"eval_steps_per_second": 4.938, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 10.625171373731835, |
|
"grad_norm": 2.606250286102295, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.7292, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 10.625171373731835, |
|
"eval_accuracy": 0.8478736538649552, |
|
"eval_loss": 0.8499141335487366, |
|
"eval_runtime": 258.2456, |
|
"eval_samples_per_second": 157.222, |
|
"eval_steps_per_second": 4.914, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 10.693720866465588, |
|
"grad_norm": 2.5361220836639404, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.7359, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 10.693720866465588, |
|
"eval_accuracy": 0.8490765860082904, |
|
"eval_loss": 0.8316646218299866, |
|
"eval_runtime": 258.3157, |
|
"eval_samples_per_second": 157.18, |
|
"eval_steps_per_second": 4.913, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 10.762270359199341, |
|
"grad_norm": 2.3277316093444824, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.7284, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 10.762270359199341, |
|
"eval_accuracy": 0.8495720225312002, |
|
"eval_loss": 0.8365707397460938, |
|
"eval_runtime": 258.2814, |
|
"eval_samples_per_second": 157.201, |
|
"eval_steps_per_second": 4.913, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 10.830819851933096, |
|
"grad_norm": 2.746189832687378, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.7316, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 10.830819851933096, |
|
"eval_accuracy": 0.8500147906280335, |
|
"eval_loss": 0.8251886963844299, |
|
"eval_runtime": 257.4381, |
|
"eval_samples_per_second": 157.716, |
|
"eval_steps_per_second": 4.929, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 10.89936934466685, |
|
"grad_norm": 2.9917354583740234, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.7304, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 10.89936934466685, |
|
"eval_accuracy": 0.8502983624757147, |
|
"eval_loss": 0.8259178996086121, |
|
"eval_runtime": 257.2855, |
|
"eval_samples_per_second": 157.809, |
|
"eval_steps_per_second": 4.932, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 10.967918837400603, |
|
"grad_norm": 2.275324583053589, |
|
"learning_rate": 2e-05, |
|
"loss": 0.7255, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 10.967918837400603, |
|
"eval_accuracy": 0.8505023638996737, |
|
"eval_loss": 0.8250493407249451, |
|
"eval_runtime": 257.2933, |
|
"eval_samples_per_second": 157.804, |
|
"eval_steps_per_second": 4.932, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 11.036468330134356, |
|
"grad_norm": 2.6113440990448, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.7224, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 11.036468330134356, |
|
"eval_accuracy": 0.8507015443771133, |
|
"eval_loss": 0.8299734592437744, |
|
"eval_runtime": 257.4676, |
|
"eval_samples_per_second": 157.698, |
|
"eval_steps_per_second": 4.929, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 11.105017822868112, |
|
"grad_norm": 2.6536693572998047, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.7208, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 11.105017822868112, |
|
"eval_accuracy": 0.8506463117116102, |
|
"eval_loss": 0.8155694007873535, |
|
"eval_runtime": 256.8592, |
|
"eval_samples_per_second": 158.071, |
|
"eval_steps_per_second": 4.94, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 11.173567315601865, |
|
"grad_norm": 2.290782928466797, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.7148, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 11.173567315601865, |
|
"eval_accuracy": 0.8507982915676063, |
|
"eval_loss": 0.8275089859962463, |
|
"eval_runtime": 257.1589, |
|
"eval_samples_per_second": 157.887, |
|
"eval_steps_per_second": 4.935, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 11.242116808335618, |
|
"grad_norm": 2.6533780097961426, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.7193, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 11.242116808335618, |
|
"eval_accuracy": 0.8510864940426184, |
|
"eval_loss": 0.8217721581459045, |
|
"eval_runtime": 258.1645, |
|
"eval_samples_per_second": 157.272, |
|
"eval_steps_per_second": 4.915, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 11.310666301069372, |
|
"grad_norm": 2.6084372997283936, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.7177, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 11.310666301069372, |
|
"eval_accuracy": 0.8508369557490207, |
|
"eval_loss": 0.8289022445678711, |
|
"eval_runtime": 258.043, |
|
"eval_samples_per_second": 157.346, |
|
"eval_steps_per_second": 4.918, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 11.379215793803127, |
|
"grad_norm": 2.2717843055725098, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.7211, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 11.379215793803127, |
|
"eval_accuracy": 0.8514459749572929, |
|
"eval_loss": 0.8198857307434082, |
|
"eval_runtime": 258.3084, |
|
"eval_samples_per_second": 157.184, |
|
"eval_steps_per_second": 4.913, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 11.44776528653688, |
|
"grad_norm": 2.34387469291687, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.7093, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 11.44776528653688, |
|
"eval_accuracy": 0.8511664827348943, |
|
"eval_loss": 0.8272643089294434, |
|
"eval_runtime": 258.1866, |
|
"eval_samples_per_second": 157.258, |
|
"eval_steps_per_second": 4.915, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 11.516314779270633, |
|
"grad_norm": 2.3854498863220215, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.7154, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 11.516314779270633, |
|
"eval_accuracy": 0.8518011662252298, |
|
"eval_loss": 0.8211445212364197, |
|
"eval_runtime": 257.2815, |
|
"eval_samples_per_second": 157.812, |
|
"eval_steps_per_second": 4.932, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 11.584864272004387, |
|
"grad_norm": 2.4457602500915527, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.7178, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 11.584864272004387, |
|
"eval_accuracy": 0.8521036808235916, |
|
"eval_loss": 0.8183203339576721, |
|
"eval_runtime": 258.3778, |
|
"eval_samples_per_second": 157.142, |
|
"eval_steps_per_second": 4.911, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 11.653413764738142, |
|
"grad_norm": 2.5457184314727783, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.716, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 11.653413764738142, |
|
"eval_accuracy": 0.8522289156626506, |
|
"eval_loss": 0.8176619410514832, |
|
"eval_runtime": 256.9844, |
|
"eval_samples_per_second": 157.994, |
|
"eval_steps_per_second": 4.938, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 11.721963257471895, |
|
"grad_norm": 2.855541467666626, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.7081, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 11.721963257471895, |
|
"eval_accuracy": 0.8522235192962014, |
|
"eval_loss": 0.8102879524230957, |
|
"eval_runtime": 257.0844, |
|
"eval_samples_per_second": 157.933, |
|
"eval_steps_per_second": 4.936, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 11.790512750205648, |
|
"grad_norm": 2.44242787361145, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.7112, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 11.790512750205648, |
|
"eval_accuracy": 0.8531087746062566, |
|
"eval_loss": 0.8167855739593506, |
|
"eval_runtime": 258.2921, |
|
"eval_samples_per_second": 157.194, |
|
"eval_steps_per_second": 4.913, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 11.859062242939402, |
|
"grad_norm": 2.559410333633423, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.7089, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 11.859062242939402, |
|
"eval_accuracy": 0.8524401167338927, |
|
"eval_loss": 0.8145312666893005, |
|
"eval_runtime": 258.2263, |
|
"eval_samples_per_second": 157.234, |
|
"eval_steps_per_second": 4.914, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 11.927611735673157, |
|
"grad_norm": 2.3273496627807617, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.7033, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 11.927611735673157, |
|
"eval_accuracy": 0.8526681918388697, |
|
"eval_loss": 0.8153809309005737, |
|
"eval_runtime": 257.196, |
|
"eval_samples_per_second": 157.864, |
|
"eval_steps_per_second": 4.934, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 11.99616122840691, |
|
"grad_norm": 2.354360580444336, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.7029, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 11.99616122840691, |
|
"eval_accuracy": 0.8531567384453895, |
|
"eval_loss": 0.8229334354400635, |
|
"eval_runtime": 257.3002, |
|
"eval_samples_per_second": 157.8, |
|
"eval_steps_per_second": 4.932, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 12.064710721140663, |
|
"grad_norm": 2.4728591442108154, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.7058, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 12.064710721140663, |
|
"eval_accuracy": 0.8537915504584094, |
|
"eval_loss": 0.8140564560890198, |
|
"eval_runtime": 258.2835, |
|
"eval_samples_per_second": 157.199, |
|
"eval_steps_per_second": 4.913, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 12.133260213874417, |
|
"grad_norm": 2.485384702682495, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.7005, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 12.133260213874417, |
|
"eval_accuracy": 0.8534658761375271, |
|
"eval_loss": 0.8151687383651733, |
|
"eval_runtime": 258.2434, |
|
"eval_samples_per_second": 157.224, |
|
"eval_steps_per_second": 4.914, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 12.201809706608172, |
|
"grad_norm": 2.530062198638916, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.6992, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 12.201809706608172, |
|
"eval_accuracy": 0.853826029943314, |
|
"eval_loss": 0.8016021847724915, |
|
"eval_runtime": 258.2461, |
|
"eval_samples_per_second": 157.222, |
|
"eval_steps_per_second": 4.914, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 12.270359199341925, |
|
"grad_norm": 2.5869436264038086, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.7008, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 12.270359199341925, |
|
"eval_accuracy": 0.8535235114498525, |
|
"eval_loss": 0.8112274408340454, |
|
"eval_runtime": 257.5365, |
|
"eval_samples_per_second": 157.655, |
|
"eval_steps_per_second": 4.927, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 12.338908692075679, |
|
"grad_norm": 2.8641934394836426, |
|
"learning_rate": 1e-05, |
|
"loss": 0.6979, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 12.338908692075679, |
|
"eval_accuracy": 0.8538098856943305, |
|
"eval_loss": 0.8109295964241028, |
|
"eval_runtime": 258.3955, |
|
"eval_samples_per_second": 157.131, |
|
"eval_steps_per_second": 4.911, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 12.407458184809432, |
|
"grad_norm": 2.686566114425659, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.6949, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 12.407458184809432, |
|
"eval_accuracy": 0.8543918997918827, |
|
"eval_loss": 0.8125308156013489, |
|
"eval_runtime": 258.1617, |
|
"eval_samples_per_second": 157.274, |
|
"eval_steps_per_second": 4.916, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 12.476007677543187, |
|
"grad_norm": 2.452526569366455, |
|
"learning_rate": 9e-06, |
|
"loss": 0.6946, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 12.476007677543187, |
|
"eval_accuracy": 0.8538440239723134, |
|
"eval_loss": 0.8097832798957825, |
|
"eval_runtime": 258.2364, |
|
"eval_samples_per_second": 157.228, |
|
"eval_steps_per_second": 4.914, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 12.54455717027694, |
|
"grad_norm": 2.463740825653076, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.6939, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 12.54455717027694, |
|
"eval_accuracy": 0.854438946024891, |
|
"eval_loss": 0.7999902963638306, |
|
"eval_runtime": 257.2634, |
|
"eval_samples_per_second": 157.823, |
|
"eval_steps_per_second": 4.933, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 12.613106663010694, |
|
"grad_norm": 2.547820568084717, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.6969, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 12.613106663010694, |
|
"eval_accuracy": 0.8542858590534823, |
|
"eval_loss": 0.8070544004440308, |
|
"eval_runtime": 257.3491, |
|
"eval_samples_per_second": 157.77, |
|
"eval_steps_per_second": 4.931, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 12.681656155744447, |
|
"grad_norm": 2.2731072902679443, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.6967, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 12.681656155744447, |
|
"eval_accuracy": 0.8545510749739543, |
|
"eval_loss": 0.7984638810157776, |
|
"eval_runtime": 258.6739, |
|
"eval_samples_per_second": 156.962, |
|
"eval_steps_per_second": 4.906, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 12.750205648478202, |
|
"grad_norm": 2.242337226867676, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.6944, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 12.750205648478202, |
|
"eval_accuracy": 0.85512864037364, |
|
"eval_loss": 0.7989787459373474, |
|
"eval_runtime": 258.5499, |
|
"eval_samples_per_second": 157.037, |
|
"eval_steps_per_second": 4.908, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 12.818755141211955, |
|
"grad_norm": 2.4914486408233643, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.6885, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 12.818755141211955, |
|
"eval_accuracy": 0.8550956587798725, |
|
"eval_loss": 0.8170965313911438, |
|
"eval_runtime": 258.3692, |
|
"eval_samples_per_second": 157.147, |
|
"eval_steps_per_second": 4.912, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 12.887304633945709, |
|
"grad_norm": 2.1909425258636475, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6897, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 12.887304633945709, |
|
"eval_accuracy": 0.8550768677242255, |
|
"eval_loss": 0.8014948964118958, |
|
"eval_runtime": 258.4993, |
|
"eval_samples_per_second": 157.068, |
|
"eval_steps_per_second": 4.909, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 12.955854126679462, |
|
"grad_norm": 2.7882330417633057, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.7027, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 12.955854126679462, |
|
"eval_accuracy": 0.8551748096354626, |
|
"eval_loss": 0.8074929118156433, |
|
"eval_runtime": 258.4421, |
|
"eval_samples_per_second": 157.103, |
|
"eval_steps_per_second": 4.91, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 13.024403619413217, |
|
"grad_norm": 2.189662218093872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6926, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 13.024403619413217, |
|
"eval_accuracy": 0.8554257092266915, |
|
"eval_loss": 0.8118977546691895, |
|
"eval_runtime": 257.8701, |
|
"eval_samples_per_second": 157.451, |
|
"eval_steps_per_second": 4.921, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 13.09295311214697, |
|
"grad_norm": 2.4796793460845947, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.697, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 13.09295311214697, |
|
"eval_accuracy": 0.8558198770391428, |
|
"eval_loss": 0.7951220870018005, |
|
"eval_runtime": 258.6851, |
|
"eval_samples_per_second": 156.955, |
|
"eval_steps_per_second": 4.906, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 13.161502604880724, |
|
"grad_norm": 2.475494146347046, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.6814, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 13.161502604880724, |
|
"eval_accuracy": 0.8557974835152675, |
|
"eval_loss": 0.7992942929267883, |
|
"eval_runtime": 258.6884, |
|
"eval_samples_per_second": 156.953, |
|
"eval_steps_per_second": 4.906, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 13.230052097614477, |
|
"grad_norm": 2.662364959716797, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.687, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 13.230052097614477, |
|
"eval_accuracy": 0.8556396210250248, |
|
"eval_loss": 0.7970269322395325, |
|
"eval_runtime": 257.7318, |
|
"eval_samples_per_second": 157.536, |
|
"eval_steps_per_second": 4.924, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 13.298601590348232, |
|
"grad_norm": 2.5556256771087646, |
|
"learning_rate": 3e-06, |
|
"loss": 0.6956, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 13.298601590348232, |
|
"eval_accuracy": 0.8560198209787908, |
|
"eval_loss": 0.7952587008476257, |
|
"eval_runtime": 258.556, |
|
"eval_samples_per_second": 157.034, |
|
"eval_steps_per_second": 4.908, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 13.367151083081986, |
|
"grad_norm": 2.327164888381958, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6821, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 13.367151083081986, |
|
"eval_accuracy": 0.855291832818916, |
|
"eval_loss": 0.8007811307907104, |
|
"eval_runtime": 257.5846, |
|
"eval_samples_per_second": 157.626, |
|
"eval_steps_per_second": 4.927, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 13.435700575815739, |
|
"grad_norm": 2.408548593521118, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.6846, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 13.435700575815739, |
|
"eval_accuracy": 0.8561709926629, |
|
"eval_loss": 0.7897204756736755, |
|
"eval_runtime": 258.6709, |
|
"eval_samples_per_second": 156.964, |
|
"eval_steps_per_second": 4.906, |
|
"step": 98000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 14, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.262679236758733e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|