{ "best_metric": 0.8943121800635999, "best_model_checkpoint": "/p/scratch/ccstdl/krishna/finetuned-cosine-loss/checkpoint-670", "epoch": 0.2726902726902727, "eval_steps": 10, "global_step": 670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00407000407000407, "grad_norm": 1.8263583183288574, "learning_rate": 3.391670058336725e-09, "loss": 0.9055, "step": 10 }, { "epoch": 0.00407000407000407, "eval_cos_sim": 0.08646486699581146, "eval_loss": 0.9145843000625319, "eval_runtime": 90.5735, "eval_samples_per_second": 11.041, "eval_steps_per_second": 0.353, "step": 10 }, { "epoch": 0.00814000814000814, "grad_norm": 1.5456867218017578, "learning_rate": 6.78334011667345e-09, "loss": 0.9069, "step": 20 }, { "epoch": 0.00814000814000814, "eval_cos_sim": 0.08647813647985458, "eval_loss": 0.9145710382674879, "eval_runtime": 87.7459, "eval_samples_per_second": 11.397, "eval_steps_per_second": 0.365, "step": 20 }, { "epoch": 0.01221001221001221, "grad_norm": 1.8042848110198975, "learning_rate": 1.0175010175010176e-08, "loss": 0.9104, "step": 30 }, { "epoch": 0.01221001221001221, "eval_cos_sim": 0.08650018274784088, "eval_loss": 0.9145490122054761, "eval_runtime": 87.7361, "eval_samples_per_second": 11.398, "eval_steps_per_second": 0.365, "step": 30 }, { "epoch": 0.01628001628001628, "grad_norm": 1.8247275352478027, "learning_rate": 1.35666802333469e-08, "loss": 0.9074, "step": 40 }, { "epoch": 0.01628001628001628, "eval_cos_sim": 0.08653160184621811, "eval_loss": 0.9145176182006544, "eval_runtime": 88.195, "eval_samples_per_second": 11.339, "eval_steps_per_second": 0.363, "step": 40 }, { "epoch": 0.02035002035002035, "grad_norm": 1.7969046831130981, "learning_rate": 1.6958350291683625e-08, "loss": 0.9065, "step": 50 }, { "epoch": 0.02035002035002035, "eval_cos_sim": 0.08657178282737732, "eval_loss": 0.9144774656509107, "eval_runtime": 88.304, "eval_samples_per_second": 11.325, "eval_steps_per_second": 0.362, "step": 50 }, { "epoch": 0.02442002442002442, "grad_norm": 1.541685700416565, "learning_rate": 2.035002035002035e-08, "loss": 0.9076, "step": 60 }, { "epoch": 0.02442002442002442, "eval_cos_sim": 0.08662137389183044, "eval_loss": 0.9144279079650587, "eval_runtime": 88.4862, "eval_samples_per_second": 11.301, "eval_steps_per_second": 0.362, "step": 60 }, { "epoch": 0.02849002849002849, "grad_norm": 1.8064115047454834, "learning_rate": 2.3741690408357078e-08, "loss": 0.9062, "step": 70 }, { "epoch": 0.02849002849002849, "eval_cos_sim": 0.08668046444654465, "eval_loss": 0.9143688574050611, "eval_runtime": 88.5586, "eval_samples_per_second": 11.292, "eval_steps_per_second": 0.361, "step": 70 }, { "epoch": 0.03256003256003256, "grad_norm": 1.8134921789169312, "learning_rate": 2.71333604666938e-08, "loss": 0.9093, "step": 80 }, { "epoch": 0.03256003256003256, "eval_cos_sim": 0.08674891293048859, "eval_loss": 0.9143004770492261, "eval_runtime": 88.0037, "eval_samples_per_second": 11.363, "eval_steps_per_second": 0.364, "step": 80 }, { "epoch": 0.03663003663003663, "grad_norm": 1.5352181196212769, "learning_rate": 3.052503052503053e-08, "loss": 0.9023, "step": 90 }, { "epoch": 0.03663003663003663, "eval_cos_sim": 0.08682523667812347, "eval_loss": 0.9142241993163771, "eval_runtime": 88.0698, "eval_samples_per_second": 11.355, "eval_steps_per_second": 0.363, "step": 90 }, { "epoch": 0.0407000407000407, "grad_norm": 1.8028756380081177, "learning_rate": 3.391670058336725e-08, "loss": 0.9023, "step": 100 }, { "epoch": 0.0407000407000407, "eval_cos_sim": 0.08691387623548508, "eval_loss": 0.9141356187080091, "eval_runtime": 88.0817, "eval_samples_per_second": 11.353, "eval_steps_per_second": 0.363, "step": 100 }, { "epoch": 0.04477004477004477, "grad_norm": 1.5299208164215088, "learning_rate": 3.730837064170397e-08, "loss": 0.9047, "step": 110 }, { "epoch": 0.04477004477004477, "eval_cos_sim": 0.08701060712337494, "eval_loss": 0.9140389495109266, "eval_runtime": 87.4523, "eval_samples_per_second": 11.435, "eval_steps_per_second": 0.366, "step": 110 }, { "epoch": 0.04884004884004884, "grad_norm": 1.551101565361023, "learning_rate": 4.07000407000407e-08, "loss": 0.9048, "step": 120 }, { "epoch": 0.04884004884004884, "eval_cos_sim": 0.08711464703083038, "eval_loss": 0.9139349808906263, "eval_runtime": 88.1611, "eval_samples_per_second": 11.343, "eval_steps_per_second": 0.363, "step": 120 }, { "epoch": 0.05291005291005291, "grad_norm": 1.4539356231689453, "learning_rate": 4.4091710758377425e-08, "loss": 0.9083, "step": 130 }, { "epoch": 0.05291005291005291, "eval_cos_sim": 0.08722705394029617, "eval_loss": 0.9138226628516859, "eval_runtime": 88.6736, "eval_samples_per_second": 11.277, "eval_steps_per_second": 0.361, "step": 130 }, { "epoch": 0.05698005698005698, "grad_norm": 1.5327789783477783, "learning_rate": 4.7483380816714155e-08, "loss": 0.9106, "step": 140 }, { "epoch": 0.05698005698005698, "eval_cos_sim": 0.08734691888093948, "eval_loss": 0.9137029032920545, "eval_runtime": 88.0264, "eval_samples_per_second": 11.36, "eval_steps_per_second": 0.364, "step": 140 }, { "epoch": 0.06105006105006105, "grad_norm": 1.5442041158676147, "learning_rate": 5.087505087505087e-08, "loss": 0.9061, "step": 150 }, { "epoch": 0.06105006105006105, "eval_cos_sim": 0.08747801929712296, "eval_loss": 0.9135719056342787, "eval_runtime": 88.2809, "eval_samples_per_second": 11.327, "eval_steps_per_second": 0.362, "step": 150 }, { "epoch": 0.06512006512006512, "grad_norm": 1.8013614416122437, "learning_rate": 5.42667209333876e-08, "loss": 0.9018, "step": 160 }, { "epoch": 0.06512006512006512, "eval_cos_sim": 0.08762170374393463, "eval_loss": 0.9134283290122693, "eval_runtime": 87.5861, "eval_samples_per_second": 11.417, "eval_steps_per_second": 0.365, "step": 160 }, { "epoch": 0.06919006919006919, "grad_norm": 1.539997935295105, "learning_rate": 5.7658390991724324e-08, "loss": 0.9054, "step": 170 }, { "epoch": 0.06919006919006919, "eval_cos_sim": 0.08777440339326859, "eval_loss": 0.913275733969087, "eval_runtime": 88.0735, "eval_samples_per_second": 11.354, "eval_steps_per_second": 0.363, "step": 170 }, { "epoch": 0.07326007326007326, "grad_norm": 1.5391697883605957, "learning_rate": 6.105006105006105e-08, "loss": 0.9053, "step": 180 }, { "epoch": 0.07326007326007326, "eval_cos_sim": 0.08793637156486511, "eval_loss": 0.9131138630126661, "eval_runtime": 88.2145, "eval_samples_per_second": 11.336, "eval_steps_per_second": 0.363, "step": 180 }, { "epoch": 0.07733007733007732, "grad_norm": 1.8431857824325562, "learning_rate": 6.444173110839778e-08, "loss": 0.9051, "step": 190 }, { "epoch": 0.07733007733007732, "eval_cos_sim": 0.08810393512248993, "eval_loss": 0.9129464283202833, "eval_runtime": 88.3103, "eval_samples_per_second": 11.324, "eval_steps_per_second": 0.362, "step": 190 }, { "epoch": 0.0814000814000814, "grad_norm": 1.826797604560852, "learning_rate": 6.78334011667345e-08, "loss": 0.9019, "step": 200 }, { "epoch": 0.0814000814000814, "eval_cos_sim": 0.0882822722196579, "eval_loss": 0.9127682285522168, "eval_runtime": 88.0167, "eval_samples_per_second": 11.361, "eval_steps_per_second": 0.364, "step": 200 }, { "epoch": 0.08547008547008547, "grad_norm": 1.5421515703201294, "learning_rate": 7.122507122507124e-08, "loss": 0.9068, "step": 210 }, { "epoch": 0.08547008547008547, "eval_cos_sim": 0.0884714424610138, "eval_loss": 0.912579200765962, "eval_runtime": 88.5456, "eval_samples_per_second": 11.294, "eval_steps_per_second": 0.361, "step": 210 }, { "epoch": 0.08954008954008955, "grad_norm": 1.8014060258865356, "learning_rate": 7.461674128340795e-08, "loss": 0.9019, "step": 220 }, { "epoch": 0.08954008954008955, "eval_cos_sim": 0.08867119997739792, "eval_loss": 0.912379605314607, "eval_runtime": 88.2292, "eval_samples_per_second": 11.334, "eval_steps_per_second": 0.363, "step": 220 }, { "epoch": 0.0936100936100936, "grad_norm": 1.8206835985183716, "learning_rate": 7.800841134174468e-08, "loss": 0.9024, "step": 230 }, { "epoch": 0.0936100936100936, "eval_cos_sim": 0.08888154476881027, "eval_loss": 0.9121694097732252, "eval_runtime": 88.303, "eval_samples_per_second": 11.325, "eval_steps_per_second": 0.362, "step": 230 }, { "epoch": 0.09768009768009768, "grad_norm": 1.7975927591323853, "learning_rate": 8.14000814000814e-08, "loss": 0.9067, "step": 240 }, { "epoch": 0.09768009768009768, "eval_cos_sim": 0.08909547328948975, "eval_loss": 0.9119556422446913, "eval_runtime": 88.313, "eval_samples_per_second": 11.323, "eval_steps_per_second": 0.362, "step": 240 }, { "epoch": 0.10175010175010175, "grad_norm": 1.8255738019943237, "learning_rate": 8.479175145841813e-08, "loss": 0.9072, "step": 250 }, { "epoch": 0.10175010175010175, "eval_cos_sim": 0.08931572735309601, "eval_loss": 0.9117355642532057, "eval_runtime": 88.4029, "eval_samples_per_second": 11.312, "eval_steps_per_second": 0.362, "step": 250 }, { "epoch": 0.10582010582010581, "grad_norm": 1.8370403051376343, "learning_rate": 8.818342151675485e-08, "loss": 0.9077, "step": 260 }, { "epoch": 0.10582010582010581, "eval_cos_sim": 0.08954235166311264, "eval_loss": 0.9115091219161695, "eval_runtime": 87.6279, "eval_samples_per_second": 11.412, "eval_steps_per_second": 0.365, "step": 260 }, { "epoch": 0.10989010989010989, "grad_norm": 1.8193926811218262, "learning_rate": 9.157509157509157e-08, "loss": 0.9026, "step": 270 }, { "epoch": 0.10989010989010989, "eval_cos_sim": 0.08978428691625595, "eval_loss": 0.9112673835967725, "eval_runtime": 88.1258, "eval_samples_per_second": 11.347, "eval_steps_per_second": 0.363, "step": 270 }, { "epoch": 0.11396011396011396, "grad_norm": 1.8138810396194458, "learning_rate": 9.496676163342831e-08, "loss": 0.9062, "step": 280 }, { "epoch": 0.11396011396011396, "eval_cos_sim": 0.09003803133964539, "eval_loss": 0.9110138240073866, "eval_runtime": 88.6231, "eval_samples_per_second": 11.284, "eval_steps_per_second": 0.361, "step": 280 }, { "epoch": 0.11803011803011804, "grad_norm": 1.7783286571502686, "learning_rate": 9.835843169176503e-08, "loss": 0.9005, "step": 290 }, { "epoch": 0.11803011803011804, "eval_cos_sim": 0.09029672294855118, "eval_loss": 0.9107553434585279, "eval_runtime": 88.3342, "eval_samples_per_second": 11.321, "eval_steps_per_second": 0.362, "step": 290 }, { "epoch": 0.1221001221001221, "grad_norm": 1.5460050106048584, "learning_rate": 1.0175010175010174e-07, "loss": 0.9065, "step": 300 }, { "epoch": 0.1221001221001221, "eval_cos_sim": 0.09056524932384491, "eval_loss": 0.9104869976257032, "eval_runtime": 88.8774, "eval_samples_per_second": 11.251, "eval_steps_per_second": 0.36, "step": 300 }, { "epoch": 0.12617012617012616, "grad_norm": 1.8059020042419434, "learning_rate": 1.0514177180843848e-07, "loss": 0.9053, "step": 310 }, { "epoch": 0.12617012617012616, "eval_cos_sim": 0.09083421528339386, "eval_loss": 0.9102182526801771, "eval_runtime": 88.2523, "eval_samples_per_second": 11.331, "eval_steps_per_second": 0.363, "step": 310 }, { "epoch": 0.13024013024013023, "grad_norm": 1.8211084604263306, "learning_rate": 1.085334418667752e-07, "loss": 0.8972, "step": 320 }, { "epoch": 0.13024013024013023, "eval_cos_sim": 0.09111514687538147, "eval_loss": 0.9099375524734206, "eval_runtime": 89.0393, "eval_samples_per_second": 11.231, "eval_steps_per_second": 0.359, "step": 320 }, { "epoch": 0.1343101343101343, "grad_norm": 1.8337125778198242, "learning_rate": 1.1192511192511194e-07, "loss": 0.904, "step": 330 }, { "epoch": 0.1343101343101343, "eval_cos_sim": 0.09140148758888245, "eval_loss": 0.9096514249061293, "eval_runtime": 88.9867, "eval_samples_per_second": 11.238, "eval_steps_per_second": 0.36, "step": 330 }, { "epoch": 0.13838013838013838, "grad_norm": 1.800909399986267, "learning_rate": 1.1531678198344865e-07, "loss": 0.9007, "step": 340 }, { "epoch": 0.13838013838013838, "eval_cos_sim": 0.09169955551624298, "eval_loss": 0.9093535943244642, "eval_runtime": 88.8961, "eval_samples_per_second": 11.249, "eval_steps_per_second": 0.36, "step": 340 }, { "epoch": 0.14245014245014245, "grad_norm": 1.8311185836791992, "learning_rate": 1.1870845204178537e-07, "loss": 0.9039, "step": 350 }, { "epoch": 0.14245014245014245, "eval_cos_sim": 0.09200900048017502, "eval_loss": 0.9090443644736952, "eval_runtime": 89.4027, "eval_samples_per_second": 11.185, "eval_steps_per_second": 0.358, "step": 350 }, { "epoch": 0.14652014652014653, "grad_norm": 1.8273859024047852, "learning_rate": 1.221001221001221e-07, "loss": 0.9059, "step": 360 }, { "epoch": 0.14652014652014653, "eval_cos_sim": 0.09233282506465912, "eval_loss": 0.9087207708572096, "eval_runtime": 89.3567, "eval_samples_per_second": 11.191, "eval_steps_per_second": 0.358, "step": 360 }, { "epoch": 0.1505901505901506, "grad_norm": 1.788926124572754, "learning_rate": 1.2549179215845883e-07, "loss": 0.8978, "step": 370 }, { "epoch": 0.1505901505901506, "eval_cos_sim": 0.09266111254692078, "eval_loss": 0.9083927383636182, "eval_runtime": 88.8151, "eval_samples_per_second": 11.259, "eval_steps_per_second": 0.36, "step": 370 }, { "epoch": 0.15466015466015465, "grad_norm": 1.7970370054244995, "learning_rate": 1.2888346221679555e-07, "loss": 0.9034, "step": 380 }, { "epoch": 0.15466015466015465, "eval_cos_sim": 0.09299919754266739, "eval_loss": 0.9080548944686597, "eval_runtime": 88.7364, "eval_samples_per_second": 11.269, "eval_steps_per_second": 0.361, "step": 380 }, { "epoch": 0.15873015873015872, "grad_norm": 1.8325952291488647, "learning_rate": 1.3227513227513228e-07, "loss": 0.9008, "step": 390 }, { "epoch": 0.15873015873015872, "eval_cos_sim": 0.09334710985422134, "eval_loss": 0.9077072668288892, "eval_runtime": 88.2466, "eval_samples_per_second": 11.332, "eval_steps_per_second": 0.363, "step": 390 }, { "epoch": 0.1628001628001628, "grad_norm": 1.8238192796707153, "learning_rate": 1.35666802333469e-07, "loss": 0.8968, "step": 400 }, { "epoch": 0.1628001628001628, "eval_cos_sim": 0.0936974287033081, "eval_loss": 0.9073572283004468, "eval_runtime": 88.1359, "eval_samples_per_second": 11.346, "eval_steps_per_second": 0.363, "step": 400 }, { "epoch": 0.16687016687016687, "grad_norm": 1.44808828830719, "learning_rate": 1.3905847239180572e-07, "loss": 0.9039, "step": 410 }, { "epoch": 0.16687016687016687, "eval_cos_sim": 0.09405460208654404, "eval_loss": 0.9070003156875318, "eval_runtime": 88.1049, "eval_samples_per_second": 11.35, "eval_steps_per_second": 0.363, "step": 410 }, { "epoch": 0.17094017094017094, "grad_norm": 1.8011168241500854, "learning_rate": 1.4245014245014247e-07, "loss": 0.8996, "step": 420 }, { "epoch": 0.17094017094017094, "eval_cos_sim": 0.09441856294870377, "eval_loss": 0.9066366181586927, "eval_runtime": 88.4922, "eval_samples_per_second": 11.3, "eval_steps_per_second": 0.362, "step": 420 }, { "epoch": 0.17501017501017502, "grad_norm": 1.5176538228988647, "learning_rate": 1.4584181250847917e-07, "loss": 0.9017, "step": 430 }, { "epoch": 0.17501017501017502, "eval_cos_sim": 0.09479890018701553, "eval_loss": 0.9062565398429578, "eval_runtime": 88.5907, "eval_samples_per_second": 11.288, "eval_steps_per_second": 0.361, "step": 430 }, { "epoch": 0.1790801790801791, "grad_norm": 1.7999022006988525, "learning_rate": 1.492334825668159e-07, "loss": 0.8991, "step": 440 }, { "epoch": 0.1790801790801791, "eval_cos_sim": 0.09518194943666458, "eval_loss": 0.9058737893317884, "eval_runtime": 88.7001, "eval_samples_per_second": 11.274, "eval_steps_per_second": 0.361, "step": 440 }, { "epoch": 0.18315018315018314, "grad_norm": 1.7988156080245972, "learning_rate": 1.5262515262515264e-07, "loss": 0.895, "step": 450 }, { "epoch": 0.18315018315018314, "eval_cos_sim": 0.09557987004518509, "eval_loss": 0.9054762168143934, "eval_runtime": 88.8417, "eval_samples_per_second": 11.256, "eval_steps_per_second": 0.36, "step": 450 }, { "epoch": 0.1872201872201872, "grad_norm": 1.8262207508087158, "learning_rate": 1.5601682268348936e-07, "loss": 0.897, "step": 460 }, { "epoch": 0.1872201872201872, "eval_cos_sim": 0.0959911048412323, "eval_loss": 0.905065335771913, "eval_runtime": 88.9908, "eval_samples_per_second": 11.237, "eval_steps_per_second": 0.36, "step": 460 }, { "epoch": 0.19129019129019129, "grad_norm": 1.551401138305664, "learning_rate": 1.594084927418261e-07, "loss": 0.8989, "step": 470 }, { "epoch": 0.19129019129019129, "eval_cos_sim": 0.09642157703638077, "eval_loss": 0.9046352105354017, "eval_runtime": 88.8499, "eval_samples_per_second": 11.255, "eval_steps_per_second": 0.36, "step": 470 }, { "epoch": 0.19536019536019536, "grad_norm": 1.5423153638839722, "learning_rate": 1.628001628001628e-07, "loss": 0.8987, "step": 480 }, { "epoch": 0.19536019536019536, "eval_cos_sim": 0.09685715287923813, "eval_loss": 0.9041999774192518, "eval_runtime": 88.8179, "eval_samples_per_second": 11.259, "eval_steps_per_second": 0.36, "step": 480 }, { "epoch": 0.19943019943019943, "grad_norm": 1.53138267993927, "learning_rate": 1.6619183285849953e-07, "loss": 0.8964, "step": 490 }, { "epoch": 0.19943019943019943, "eval_cos_sim": 0.09730342030525208, "eval_loss": 0.9037540726875013, "eval_runtime": 89.2751, "eval_samples_per_second": 11.201, "eval_steps_per_second": 0.358, "step": 490 }, { "epoch": 0.2035002035002035, "grad_norm": 1.8261990547180176, "learning_rate": 1.6958350291683626e-07, "loss": 0.8933, "step": 500 }, { "epoch": 0.2035002035002035, "eval_cos_sim": 0.0977540910243988, "eval_loss": 0.9033037514899915, "eval_runtime": 88.8398, "eval_samples_per_second": 11.256, "eval_steps_per_second": 0.36, "step": 500 }, { "epoch": 0.20757020757020758, "grad_norm": 1.5531574487686157, "learning_rate": 1.7297517297517298e-07, "loss": 0.8969, "step": 510 }, { "epoch": 0.20757020757020758, "eval_cos_sim": 0.09821291267871857, "eval_loss": 0.9028452673171705, "eval_runtime": 88.8235, "eval_samples_per_second": 11.258, "eval_steps_per_second": 0.36, "step": 510 }, { "epoch": 0.21164021164021163, "grad_norm": 1.8243498802185059, "learning_rate": 1.763668430335097e-07, "loss": 0.8912, "step": 520 }, { "epoch": 0.21164021164021163, "eval_cos_sim": 0.0986839011311531, "eval_loss": 0.902374650022859, "eval_runtime": 89.4186, "eval_samples_per_second": 11.183, "eval_steps_per_second": 0.358, "step": 520 }, { "epoch": 0.2157102157102157, "grad_norm": 1.8204760551452637, "learning_rate": 1.7975851309184642e-07, "loss": 0.8908, "step": 530 }, { "epoch": 0.2157102157102157, "eval_cos_sim": 0.099166639149189, "eval_loss": 0.9018922744010633, "eval_runtime": 89.0165, "eval_samples_per_second": 11.234, "eval_steps_per_second": 0.359, "step": 530 }, { "epoch": 0.21978021978021978, "grad_norm": 1.4353723526000977, "learning_rate": 1.8315018315018315e-07, "loss": 0.8981, "step": 540 }, { "epoch": 0.21978021978021978, "eval_cos_sim": 0.09965574741363525, "eval_loss": 0.9014035372947401, "eval_runtime": 88.7816, "eval_samples_per_second": 11.264, "eval_steps_per_second": 0.36, "step": 540 }, { "epoch": 0.22385022385022385, "grad_norm": 1.8110315799713135, "learning_rate": 1.865418532085199e-07, "loss": 0.8933, "step": 550 }, { "epoch": 0.22385022385022385, "eval_cos_sim": 0.10014832764863968, "eval_loss": 0.9009113145087904, "eval_runtime": 89.8055, "eval_samples_per_second": 11.135, "eval_steps_per_second": 0.356, "step": 550 }, { "epoch": 0.22792022792022792, "grad_norm": 1.4287735223770142, "learning_rate": 1.8993352326685662e-07, "loss": 0.899, "step": 560 }, { "epoch": 0.22792022792022792, "eval_cos_sim": 0.1006506159901619, "eval_loss": 0.9004093914245314, "eval_runtime": 90.0033, "eval_samples_per_second": 11.111, "eval_steps_per_second": 0.356, "step": 560 }, { "epoch": 0.231990231990232, "grad_norm": 1.8295832872390747, "learning_rate": 1.9332519332519332e-07, "loss": 0.896, "step": 570 }, { "epoch": 0.231990231990232, "eval_cos_sim": 0.10114751756191254, "eval_loss": 0.8999129333709425, "eval_runtime": 89.8231, "eval_samples_per_second": 11.133, "eval_steps_per_second": 0.356, "step": 570 }, { "epoch": 0.23606023606023607, "grad_norm": 1.831000804901123, "learning_rate": 1.9671686338353007e-07, "loss": 0.8949, "step": 580 }, { "epoch": 0.23606023606023607, "eval_cos_sim": 0.10166067630052567, "eval_loss": 0.8994001808379835, "eval_runtime": 90.3377, "eval_samples_per_second": 11.07, "eval_steps_per_second": 0.354, "step": 580 }, { "epoch": 0.24013024013024012, "grad_norm": 1.7988686561584473, "learning_rate": 2.001085334418668e-07, "loss": 0.8908, "step": 590 }, { "epoch": 0.24013024013024012, "eval_cos_sim": 0.10217340290546417, "eval_loss": 0.8988878832076734, "eval_runtime": 90.1812, "eval_samples_per_second": 11.089, "eval_steps_per_second": 0.355, "step": 590 }, { "epoch": 0.2442002442002442, "grad_norm": 1.835419774055481, "learning_rate": 2.0350020350020349e-07, "loss": 0.8902, "step": 600 }, { "epoch": 0.2442002442002442, "eval_cos_sim": 0.10271025449037552, "eval_loss": 0.8983514561866468, "eval_runtime": 89.9875, "eval_samples_per_second": 11.113, "eval_steps_per_second": 0.356, "step": 600 }, { "epoch": 0.24827024827024827, "grad_norm": 1.5257744789123535, "learning_rate": 2.0689187355854024e-07, "loss": 0.8949, "step": 610 }, { "epoch": 0.24827024827024827, "eval_cos_sim": 0.10325058549642563, "eval_loss": 0.8978115306113905, "eval_runtime": 89.7723, "eval_samples_per_second": 11.139, "eval_steps_per_second": 0.356, "step": 610 }, { "epoch": 0.2523402523402523, "grad_norm": 1.5515631437301636, "learning_rate": 2.1028354361687696e-07, "loss": 0.8871, "step": 620 }, { "epoch": 0.2523402523402523, "eval_cos_sim": 0.10381077229976654, "eval_loss": 0.8972517957900709, "eval_runtime": 89.3737, "eval_samples_per_second": 11.189, "eval_steps_per_second": 0.358, "step": 620 }, { "epoch": 0.2564102564102564, "grad_norm": 1.5338140726089478, "learning_rate": 2.136752136752137e-07, "loss": 0.8885, "step": 630 }, { "epoch": 0.2564102564102564, "eval_cos_sim": 0.10438349843025208, "eval_loss": 0.896679551622743, "eval_runtime": 89.6231, "eval_samples_per_second": 11.158, "eval_steps_per_second": 0.357, "step": 630 }, { "epoch": 0.26048026048026046, "grad_norm": 1.8301513195037842, "learning_rate": 2.170668837335504e-07, "loss": 0.887, "step": 640 }, { "epoch": 0.26048026048026046, "eval_cos_sim": 0.10495594143867493, "eval_loss": 0.8961075730537122, "eval_runtime": 90.7656, "eval_samples_per_second": 11.017, "eval_steps_per_second": 0.353, "step": 640 }, { "epoch": 0.26455026455026454, "grad_norm": 1.5466346740722656, "learning_rate": 2.2045855379188713e-07, "loss": 0.8892, "step": 650 }, { "epoch": 0.26455026455026454, "eval_cos_sim": 0.10554695129394531, "eval_loss": 0.8955170693610853, "eval_runtime": 90.0073, "eval_samples_per_second": 11.11, "eval_steps_per_second": 0.356, "step": 650 }, { "epoch": 0.2686202686202686, "grad_norm": 1.811012864112854, "learning_rate": 2.2385022385022388e-07, "loss": 0.8844, "step": 660 }, { "epoch": 0.2686202686202686, "eval_cos_sim": 0.10614697635173798, "eval_loss": 0.8949175381873793, "eval_runtime": 91.0107, "eval_samples_per_second": 10.988, "eval_steps_per_second": 0.352, "step": 660 }, { "epoch": 0.2726902726902727, "grad_norm": 1.6698360443115234, "learning_rate": 2.2724189390856057e-07, "loss": 0.8877, "step": 670 }, { "epoch": 0.2726902726902727, "eval_cos_sim": 0.10675283521413803, "eval_loss": 0.8943121800635999, "eval_runtime": 90.5845, "eval_samples_per_second": 11.039, "eval_steps_per_second": 0.353, "step": 670 } ], "logging_steps": 10, "max_steps": 1474200, "num_input_tokens_seen": 0, "num_train_epochs": 600, "save_steps": 10, "total_flos": 0.0, "train_batch_size": 160, "trial_name": null, "trial_params": null }