{ "best_metric": 1.6563588380813599, "best_model_checkpoint": "4bit_repro_03022025/host9_seed_42_full_det_fp16_no_flash_attn_fix_pad_llama-3.2-instruct-l16-no-cot-4ep-lr3e04-ws20-bs8-ga4-fp16-07022025/checkpoint-110", "epoch": 3.9357798165137616, "eval_steps": 500, "global_step": 216, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01834862385321101, "grad_norm": 1.1623170375823975, "learning_rate": 1.4999999999999999e-05, "loss": 3.8107, "step": 1 }, { "epoch": 0.03669724770642202, "grad_norm": 1.2679989337921143, "learning_rate": 2.9999999999999997e-05, "loss": 3.6972, "step": 2 }, { "epoch": 0.05504587155963303, "grad_norm": 1.7297697067260742, "learning_rate": 4.4999999999999996e-05, "loss": 3.9797, "step": 3 }, { "epoch": 0.07339449541284404, "grad_norm": 0.9575896263122559, "learning_rate": 5.9999999999999995e-05, "loss": 3.7022, "step": 4 }, { "epoch": 0.09174311926605505, "grad_norm": 1.3155354261398315, "learning_rate": 7.5e-05, "loss": 3.678, "step": 5 }, { "epoch": 0.11009174311926606, "grad_norm": 1.0387929677963257, "learning_rate": 8.999999999999999e-05, "loss": 3.4512, "step": 6 }, { "epoch": 0.12844036697247707, "grad_norm": 1.3004755973815918, "learning_rate": 0.00010499999999999999, "loss": 3.3679, "step": 7 }, { "epoch": 0.14678899082568808, "grad_norm": 0.92799311876297, "learning_rate": 0.00011999999999999999, "loss": 3.4198, "step": 8 }, { "epoch": 0.1651376146788991, "grad_norm": 1.3040410280227661, "learning_rate": 0.000135, "loss": 3.1026, "step": 9 }, { "epoch": 0.1834862385321101, "grad_norm": 0.9226571321487427, "learning_rate": 0.00015, "loss": 3.0485, "step": 10 }, { "epoch": 0.2018348623853211, "grad_norm": 0.5856759548187256, "learning_rate": 0.000165, "loss": 2.9433, "step": 11 }, { "epoch": 0.22018348623853212, "grad_norm": 0.6692156791687012, "learning_rate": 0.00017999999999999998, "loss": 2.7925, "step": 12 }, { "epoch": 0.23853211009174313, "grad_norm": 0.5744884014129639, "learning_rate": 0.000195, "loss": 2.9089, "step": 13 }, { "epoch": 0.25688073394495414, "grad_norm": 0.7481144070625305, "learning_rate": 0.00020999999999999998, "loss": 2.5451, "step": 14 }, { "epoch": 0.27522935779816515, "grad_norm": 0.5582759380340576, "learning_rate": 0.000225, "loss": 2.9272, "step": 15 }, { "epoch": 0.29357798165137616, "grad_norm": 0.6636083722114563, "learning_rate": 0.00023999999999999998, "loss": 2.5954, "step": 16 }, { "epoch": 0.3119266055045872, "grad_norm": 0.8177175521850586, "learning_rate": 0.00025499999999999996, "loss": 2.3798, "step": 17 }, { "epoch": 0.3302752293577982, "grad_norm": 0.6705996990203857, "learning_rate": 0.00027, "loss": 2.4108, "step": 18 }, { "epoch": 0.3486238532110092, "grad_norm": 0.602409839630127, "learning_rate": 0.000285, "loss": 2.7386, "step": 19 }, { "epoch": 0.3669724770642202, "grad_norm": 0.569008469581604, "learning_rate": 0.0003, "loss": 2.5557, "step": 20 }, { "epoch": 0.3853211009174312, "grad_norm": 0.5509985089302063, "learning_rate": 0.00029846938775510205, "loss": 2.3005, "step": 21 }, { "epoch": 0.4036697247706422, "grad_norm": 0.5118123888969421, "learning_rate": 0.0002969387755102041, "loss": 2.3586, "step": 22 }, { "epoch": 0.42201834862385323, "grad_norm": 0.4925268590450287, "learning_rate": 0.0002954081632653061, "loss": 2.4392, "step": 23 }, { "epoch": 0.44036697247706424, "grad_norm": 0.46481165289878845, "learning_rate": 0.0002938775510204081, "loss": 2.1959, "step": 24 }, { "epoch": 0.45871559633027525, "grad_norm": 0.4252929985523224, "learning_rate": 0.0002923469387755102, "loss": 1.9135, "step": 25 }, { "epoch": 0.47706422018348627, "grad_norm": 0.36956462264060974, "learning_rate": 0.00029081632653061223, "loss": 2.3824, "step": 26 }, { "epoch": 0.4954128440366973, "grad_norm": 0.37660282850265503, "learning_rate": 0.00028928571428571425, "loss": 1.7635, "step": 27 }, { "epoch": 0.5137614678899083, "grad_norm": 0.38930436968803406, "learning_rate": 0.0002877551020408163, "loss": 2.365, "step": 28 }, { "epoch": 0.5321100917431193, "grad_norm": 0.3706170916557312, "learning_rate": 0.00028622448979591836, "loss": 2.1775, "step": 29 }, { "epoch": 0.5504587155963303, "grad_norm": 0.3654409945011139, "learning_rate": 0.0002846938775510204, "loss": 2.2779, "step": 30 }, { "epoch": 0.5688073394495413, "grad_norm": 0.34728649258613586, "learning_rate": 0.0002831632653061224, "loss": 2.1496, "step": 31 }, { "epoch": 0.5871559633027523, "grad_norm": 0.3580109477043152, "learning_rate": 0.0002816326530612245, "loss": 1.8089, "step": 32 }, { "epoch": 0.6055045871559633, "grad_norm": 0.359658807516098, "learning_rate": 0.0002801020408163265, "loss": 1.9021, "step": 33 }, { "epoch": 0.6238532110091743, "grad_norm": 0.325698584318161, "learning_rate": 0.00027857142857142854, "loss": 2.2566, "step": 34 }, { "epoch": 0.6422018348623854, "grad_norm": 0.33368322253227234, "learning_rate": 0.00027704081632653056, "loss": 2.0936, "step": 35 }, { "epoch": 0.6605504587155964, "grad_norm": 0.3149988651275635, "learning_rate": 0.00027551020408163264, "loss": 2.1885, "step": 36 }, { "epoch": 0.6788990825688074, "grad_norm": 0.3853766918182373, "learning_rate": 0.00027397959183673466, "loss": 2.2931, "step": 37 }, { "epoch": 0.6972477064220184, "grad_norm": 0.39314553141593933, "learning_rate": 0.0002724489795918367, "loss": 2.2779, "step": 38 }, { "epoch": 0.7155963302752294, "grad_norm": 0.3782222270965576, "learning_rate": 0.0002709183673469387, "loss": 1.6485, "step": 39 }, { "epoch": 0.7339449541284404, "grad_norm": 0.39382827281951904, "learning_rate": 0.0002693877551020408, "loss": 2.3373, "step": 40 }, { "epoch": 0.7522935779816514, "grad_norm": 0.3778757154941559, "learning_rate": 0.00026785714285714287, "loss": 2.0118, "step": 41 }, { "epoch": 0.7706422018348624, "grad_norm": 0.40335342288017273, "learning_rate": 0.0002663265306122449, "loss": 1.9141, "step": 42 }, { "epoch": 0.7889908256880734, "grad_norm": 0.352906197309494, "learning_rate": 0.0002647959183673469, "loss": 2.0026, "step": 43 }, { "epoch": 0.8073394495412844, "grad_norm": 0.36179277300834656, "learning_rate": 0.00026326530612244894, "loss": 2.2164, "step": 44 }, { "epoch": 0.8256880733944955, "grad_norm": 0.39216384291648865, "learning_rate": 0.000261734693877551, "loss": 1.4207, "step": 45 }, { "epoch": 0.8440366972477065, "grad_norm": 0.3671320080757141, "learning_rate": 0.00026020408163265305, "loss": 2.1397, "step": 46 }, { "epoch": 0.8623853211009175, "grad_norm": 0.4085288643836975, "learning_rate": 0.00025867346938775507, "loss": 1.7143, "step": 47 }, { "epoch": 0.8807339449541285, "grad_norm": 0.41445520520210266, "learning_rate": 0.0002571428571428571, "loss": 2.019, "step": 48 }, { "epoch": 0.8990825688073395, "grad_norm": 0.41389527916908264, "learning_rate": 0.0002556122448979592, "loss": 1.8466, "step": 49 }, { "epoch": 0.9174311926605505, "grad_norm": 0.4167148768901825, "learning_rate": 0.0002540816326530612, "loss": 1.8376, "step": 50 }, { "epoch": 0.9357798165137615, "grad_norm": 0.3998531997203827, "learning_rate": 0.0002525510204081632, "loss": 1.6899, "step": 51 }, { "epoch": 0.9541284403669725, "grad_norm": 0.4575762450695038, "learning_rate": 0.0002510204081632653, "loss": 1.336, "step": 52 }, { "epoch": 0.9724770642201835, "grad_norm": 0.4299473166465759, "learning_rate": 0.00024948979591836733, "loss": 1.7322, "step": 53 }, { "epoch": 0.9908256880733946, "grad_norm": 0.41627636551856995, "learning_rate": 0.00024795918367346935, "loss": 1.7616, "step": 54 }, { "epoch": 1.0, "grad_norm": 0.6094969511032104, "learning_rate": 0.0002464285714285714, "loss": 1.8861, "step": 55 }, { "epoch": 1.0, "eval_loss": 1.7122242450714111, "eval_runtime": 38.3867, "eval_samples_per_second": 8.649, "eval_steps_per_second": 4.324, "step": 55 }, { "epoch": 1.018348623853211, "grad_norm": 0.4282676577568054, "learning_rate": 0.00024489795918367346, "loss": 1.6534, "step": 56 }, { "epoch": 1.036697247706422, "grad_norm": 0.5036994218826294, "learning_rate": 0.00024336734693877548, "loss": 1.9468, "step": 57 }, { "epoch": 1.0550458715596331, "grad_norm": 0.43860867619514465, "learning_rate": 0.00024183673469387753, "loss": 1.7436, "step": 58 }, { "epoch": 1.073394495412844, "grad_norm": 0.4401974678039551, "learning_rate": 0.00024030612244897956, "loss": 1.747, "step": 59 }, { "epoch": 1.091743119266055, "grad_norm": 0.4853396713733673, "learning_rate": 0.0002387755102040816, "loss": 1.6576, "step": 60 }, { "epoch": 1.110091743119266, "grad_norm": 0.41972073912620544, "learning_rate": 0.00023724489795918366, "loss": 1.9592, "step": 61 }, { "epoch": 1.1284403669724772, "grad_norm": 0.4083613157272339, "learning_rate": 0.00023571428571428569, "loss": 1.9099, "step": 62 }, { "epoch": 1.146788990825688, "grad_norm": 0.4990442991256714, "learning_rate": 0.00023418367346938774, "loss": 1.7113, "step": 63 }, { "epoch": 1.165137614678899, "grad_norm": 0.45442479848861694, "learning_rate": 0.00023265306122448976, "loss": 1.5857, "step": 64 }, { "epoch": 1.18348623853211, "grad_norm": 0.4256112575531006, "learning_rate": 0.00023112244897959181, "loss": 1.7259, "step": 65 }, { "epoch": 1.2018348623853212, "grad_norm": 0.4265333414077759, "learning_rate": 0.00022959183673469384, "loss": 1.6525, "step": 66 }, { "epoch": 1.2201834862385321, "grad_norm": 0.44959592819213867, "learning_rate": 0.0002280612244897959, "loss": 1.4174, "step": 67 }, { "epoch": 1.238532110091743, "grad_norm": 0.5054103136062622, "learning_rate": 0.00022653061224489791, "loss": 1.6872, "step": 68 }, { "epoch": 1.2568807339449541, "grad_norm": 0.4578041732311249, "learning_rate": 0.000225, "loss": 1.568, "step": 69 }, { "epoch": 1.2752293577981653, "grad_norm": 0.4960290789604187, "learning_rate": 0.00022346938775510205, "loss": 1.8964, "step": 70 }, { "epoch": 1.2935779816513762, "grad_norm": 0.513982355594635, "learning_rate": 0.00022193877551020407, "loss": 1.5835, "step": 71 }, { "epoch": 1.311926605504587, "grad_norm": 0.42694035172462463, "learning_rate": 0.00022040816326530612, "loss": 1.7641, "step": 72 }, { "epoch": 1.3302752293577982, "grad_norm": 0.4448159337043762, "learning_rate": 0.00021887755102040815, "loss": 1.7207, "step": 73 }, { "epoch": 1.3486238532110093, "grad_norm": 0.45734143257141113, "learning_rate": 0.0002173469387755102, "loss": 1.8571, "step": 74 }, { "epoch": 1.3669724770642202, "grad_norm": 0.4641169607639313, "learning_rate": 0.00021581632653061222, "loss": 1.5389, "step": 75 }, { "epoch": 1.385321100917431, "grad_norm": 0.5520796179771423, "learning_rate": 0.00021428571428571427, "loss": 1.5691, "step": 76 }, { "epoch": 1.4036697247706422, "grad_norm": 0.5665276050567627, "learning_rate": 0.0002127551020408163, "loss": 1.634, "step": 77 }, { "epoch": 1.4220183486238533, "grad_norm": 0.46170687675476074, "learning_rate": 0.00021122448979591835, "loss": 1.5442, "step": 78 }, { "epoch": 1.4403669724770642, "grad_norm": 0.5029768943786621, "learning_rate": 0.0002096938775510204, "loss": 1.5408, "step": 79 }, { "epoch": 1.4587155963302751, "grad_norm": 0.4695865213871002, "learning_rate": 0.00020816326530612243, "loss": 1.5197, "step": 80 }, { "epoch": 1.4770642201834863, "grad_norm": 0.5550664067268372, "learning_rate": 0.00020663265306122448, "loss": 1.6369, "step": 81 }, { "epoch": 1.4954128440366974, "grad_norm": 0.5457161664962769, "learning_rate": 0.0002051020408163265, "loss": 1.6256, "step": 82 }, { "epoch": 1.5137614678899083, "grad_norm": 0.527881920337677, "learning_rate": 0.00020357142857142856, "loss": 1.4423, "step": 83 }, { "epoch": 1.5321100917431192, "grad_norm": 0.5205135941505432, "learning_rate": 0.00020204081632653058, "loss": 1.4559, "step": 84 }, { "epoch": 1.5504587155963303, "grad_norm": 1.0103683471679688, "learning_rate": 0.00020051020408163263, "loss": 1.7318, "step": 85 }, { "epoch": 1.5688073394495414, "grad_norm": 0.5606263279914856, "learning_rate": 0.00019897959183673466, "loss": 1.5608, "step": 86 }, { "epoch": 1.5871559633027523, "grad_norm": 0.5933471322059631, "learning_rate": 0.0001974489795918367, "loss": 1.6343, "step": 87 }, { "epoch": 1.6055045871559632, "grad_norm": 0.5846021771430969, "learning_rate": 0.00019591836734693873, "loss": 1.3335, "step": 88 }, { "epoch": 1.6238532110091743, "grad_norm": 0.6300230622291565, "learning_rate": 0.0001943877551020408, "loss": 1.3714, "step": 89 }, { "epoch": 1.6422018348623855, "grad_norm": 0.6161322593688965, "learning_rate": 0.00019285714285714286, "loss": 1.5617, "step": 90 }, { "epoch": 1.6605504587155964, "grad_norm": 0.7593424320220947, "learning_rate": 0.0001913265306122449, "loss": 1.3631, "step": 91 }, { "epoch": 1.6788990825688073, "grad_norm": 0.7726075053215027, "learning_rate": 0.00018979591836734694, "loss": 1.3682, "step": 92 }, { "epoch": 1.6972477064220184, "grad_norm": 0.6755046248435974, "learning_rate": 0.00018826530612244896, "loss": 1.3317, "step": 93 }, { "epoch": 1.7155963302752295, "grad_norm": 0.7959868311882019, "learning_rate": 0.00018673469387755102, "loss": 1.4428, "step": 94 }, { "epoch": 1.7339449541284404, "grad_norm": 0.7094178199768066, "learning_rate": 0.00018520408163265304, "loss": 1.4291, "step": 95 }, { "epoch": 1.7522935779816513, "grad_norm": 0.6024708151817322, "learning_rate": 0.0001836734693877551, "loss": 1.5103, "step": 96 }, { "epoch": 1.7706422018348624, "grad_norm": 0.682614803314209, "learning_rate": 0.00018214285714285712, "loss": 1.4783, "step": 97 }, { "epoch": 1.7889908256880735, "grad_norm": 0.7238509058952332, "learning_rate": 0.00018061224489795917, "loss": 1.5119, "step": 98 }, { "epoch": 1.8073394495412844, "grad_norm": 0.5742793679237366, "learning_rate": 0.00017908163265306122, "loss": 1.48, "step": 99 }, { "epoch": 1.8256880733944953, "grad_norm": 0.6613648533821106, "learning_rate": 0.00017755102040816325, "loss": 1.3407, "step": 100 }, { "epoch": 1.8440366972477065, "grad_norm": 0.6030130982398987, "learning_rate": 0.0001760204081632653, "loss": 1.4895, "step": 101 }, { "epoch": 1.8623853211009176, "grad_norm": 0.7113239169120789, "learning_rate": 0.00017448979591836732, "loss": 1.4172, "step": 102 }, { "epoch": 1.8807339449541285, "grad_norm": 0.655587911605835, "learning_rate": 0.00017295918367346937, "loss": 1.5067, "step": 103 }, { "epoch": 1.8990825688073394, "grad_norm": 0.5982323288917542, "learning_rate": 0.0001714285714285714, "loss": 1.3779, "step": 104 }, { "epoch": 1.9174311926605505, "grad_norm": 0.5400868058204651, "learning_rate": 0.00016989795918367345, "loss": 1.1337, "step": 105 }, { "epoch": 1.9357798165137616, "grad_norm": 0.7091734409332275, "learning_rate": 0.00016836734693877547, "loss": 1.4084, "step": 106 }, { "epoch": 1.9541284403669725, "grad_norm": 0.6091794371604919, "learning_rate": 0.00016683673469387753, "loss": 1.3272, "step": 107 }, { "epoch": 1.9724770642201834, "grad_norm": 0.6288979053497314, "learning_rate": 0.00016530612244897955, "loss": 1.242, "step": 108 }, { "epoch": 1.9908256880733946, "grad_norm": 0.644861102104187, "learning_rate": 0.00016377551020408163, "loss": 1.3282, "step": 109 }, { "epoch": 2.0, "grad_norm": 0.8754244446754456, "learning_rate": 0.00016224489795918368, "loss": 1.1291, "step": 110 }, { "epoch": 2.0, "eval_loss": 1.6563588380813599, "eval_runtime": 38.2656, "eval_samples_per_second": 8.676, "eval_steps_per_second": 4.338, "step": 110 }, { "epoch": 2.018348623853211, "grad_norm": 0.7026936411857605, "learning_rate": 0.0001607142857142857, "loss": 1.3301, "step": 111 }, { "epoch": 2.036697247706422, "grad_norm": 0.7205306887626648, "learning_rate": 0.00015918367346938776, "loss": 1.3148, "step": 112 }, { "epoch": 2.055045871559633, "grad_norm": 0.6323193907737732, "learning_rate": 0.00015765306122448978, "loss": 1.2063, "step": 113 }, { "epoch": 2.073394495412844, "grad_norm": 0.6059937477111816, "learning_rate": 0.00015612244897959183, "loss": 1.3092, "step": 114 }, { "epoch": 2.091743119266055, "grad_norm": 0.6562811136245728, "learning_rate": 0.00015459183673469386, "loss": 1.1997, "step": 115 }, { "epoch": 2.1100917431192663, "grad_norm": 0.5683314800262451, "learning_rate": 0.0001530612244897959, "loss": 0.9851, "step": 116 }, { "epoch": 2.128440366972477, "grad_norm": 0.6609128713607788, "learning_rate": 0.00015153061224489794, "loss": 1.2177, "step": 117 }, { "epoch": 2.146788990825688, "grad_norm": 0.6697719693183899, "learning_rate": 0.00015, "loss": 1.1673, "step": 118 }, { "epoch": 2.165137614678899, "grad_norm": 0.781191349029541, "learning_rate": 0.00014846938775510204, "loss": 1.2994, "step": 119 }, { "epoch": 2.18348623853211, "grad_norm": 0.7695513367652893, "learning_rate": 0.00014693877551020406, "loss": 1.1544, "step": 120 }, { "epoch": 2.2018348623853212, "grad_norm": 0.6256545186042786, "learning_rate": 0.00014540816326530611, "loss": 1.2283, "step": 121 }, { "epoch": 2.220183486238532, "grad_norm": 0.8519290089607239, "learning_rate": 0.00014387755102040814, "loss": 1.338, "step": 122 }, { "epoch": 2.238532110091743, "grad_norm": 0.7241263389587402, "learning_rate": 0.0001423469387755102, "loss": 1.2604, "step": 123 }, { "epoch": 2.2568807339449544, "grad_norm": 0.5714284181594849, "learning_rate": 0.00014081632653061224, "loss": 1.2475, "step": 124 }, { "epoch": 2.2752293577981653, "grad_norm": 0.6091315746307373, "learning_rate": 0.00013928571428571427, "loss": 1.1513, "step": 125 }, { "epoch": 2.293577981651376, "grad_norm": 0.5888165235519409, "learning_rate": 0.00013775510204081632, "loss": 0.9258, "step": 126 }, { "epoch": 2.311926605504587, "grad_norm": 0.7953323721885681, "learning_rate": 0.00013622448979591834, "loss": 1.2609, "step": 127 }, { "epoch": 2.330275229357798, "grad_norm": 0.6867144107818604, "learning_rate": 0.0001346938775510204, "loss": 1.2602, "step": 128 }, { "epoch": 2.3486238532110093, "grad_norm": 0.6500101685523987, "learning_rate": 0.00013316326530612245, "loss": 1.0713, "step": 129 }, { "epoch": 2.36697247706422, "grad_norm": 0.9171455502510071, "learning_rate": 0.00013163265306122447, "loss": 1.1521, "step": 130 }, { "epoch": 2.385321100917431, "grad_norm": 0.6744725108146667, "learning_rate": 0.00013010204081632652, "loss": 1.1871, "step": 131 }, { "epoch": 2.4036697247706424, "grad_norm": 0.7064054012298584, "learning_rate": 0.00012857142857142855, "loss": 1.1598, "step": 132 }, { "epoch": 2.4220183486238533, "grad_norm": 0.7427341938018799, "learning_rate": 0.0001270408163265306, "loss": 1.101, "step": 133 }, { "epoch": 2.4403669724770642, "grad_norm": 0.6846525073051453, "learning_rate": 0.00012551020408163265, "loss": 1.0844, "step": 134 }, { "epoch": 2.458715596330275, "grad_norm": 0.6846270561218262, "learning_rate": 0.00012397959183673468, "loss": 1.114, "step": 135 }, { "epoch": 2.477064220183486, "grad_norm": 0.6074003577232361, "learning_rate": 0.00012244897959183673, "loss": 1.1264, "step": 136 }, { "epoch": 2.4954128440366974, "grad_norm": 0.639379620552063, "learning_rate": 0.00012091836734693877, "loss": 1.2136, "step": 137 }, { "epoch": 2.5137614678899083, "grad_norm": 0.6055455803871155, "learning_rate": 0.0001193877551020408, "loss": 1.0061, "step": 138 }, { "epoch": 2.532110091743119, "grad_norm": 0.6591995358467102, "learning_rate": 0.00011785714285714284, "loss": 1.0949, "step": 139 }, { "epoch": 2.5504587155963305, "grad_norm": 0.6247413754463196, "learning_rate": 0.00011632653061224488, "loss": 1.0368, "step": 140 }, { "epoch": 2.5688073394495414, "grad_norm": 1.265088438987732, "learning_rate": 0.00011479591836734692, "loss": 1.1696, "step": 141 }, { "epoch": 2.5871559633027523, "grad_norm": 0.6811426281929016, "learning_rate": 0.00011326530612244896, "loss": 1.0469, "step": 142 }, { "epoch": 2.6055045871559632, "grad_norm": 0.7476761937141418, "learning_rate": 0.00011173469387755102, "loss": 1.0976, "step": 143 }, { "epoch": 2.623853211009174, "grad_norm": 0.6777238249778748, "learning_rate": 0.00011020408163265306, "loss": 1.1342, "step": 144 }, { "epoch": 2.6422018348623855, "grad_norm": 0.6841846704483032, "learning_rate": 0.0001086734693877551, "loss": 0.9781, "step": 145 }, { "epoch": 2.6605504587155964, "grad_norm": 0.7566942572593689, "learning_rate": 0.00010714285714285714, "loss": 1.2729, "step": 146 }, { "epoch": 2.6788990825688073, "grad_norm": 0.7708871364593506, "learning_rate": 0.00010561224489795918, "loss": 1.0331, "step": 147 }, { "epoch": 2.6972477064220186, "grad_norm": 0.7078199982643127, "learning_rate": 0.00010408163265306121, "loss": 1.0019, "step": 148 }, { "epoch": 2.7155963302752295, "grad_norm": 0.6287170052528381, "learning_rate": 0.00010255102040816325, "loss": 1.1889, "step": 149 }, { "epoch": 2.7339449541284404, "grad_norm": 0.6501107215881348, "learning_rate": 0.00010102040816326529, "loss": 1.1433, "step": 150 }, { "epoch": 2.7522935779816513, "grad_norm": 0.7870299816131592, "learning_rate": 9.948979591836733e-05, "loss": 1.0418, "step": 151 }, { "epoch": 2.770642201834862, "grad_norm": 0.6767966151237488, "learning_rate": 9.795918367346937e-05, "loss": 1.0161, "step": 152 }, { "epoch": 2.7889908256880735, "grad_norm": 0.6706618666648865, "learning_rate": 9.642857142857143e-05, "loss": 1.0978, "step": 153 }, { "epoch": 2.8073394495412844, "grad_norm": 0.6994735598564148, "learning_rate": 9.489795918367347e-05, "loss": 0.9295, "step": 154 }, { "epoch": 2.8256880733944953, "grad_norm": 0.6585462093353271, "learning_rate": 9.336734693877551e-05, "loss": 1.0937, "step": 155 }, { "epoch": 2.8440366972477067, "grad_norm": 0.7117604613304138, "learning_rate": 9.183673469387755e-05, "loss": 0.9225, "step": 156 }, { "epoch": 2.8623853211009176, "grad_norm": 0.8142116665840149, "learning_rate": 9.030612244897958e-05, "loss": 0.9156, "step": 157 }, { "epoch": 2.8807339449541285, "grad_norm": 1.2077499628067017, "learning_rate": 8.877551020408162e-05, "loss": 1.0157, "step": 158 }, { "epoch": 2.8990825688073394, "grad_norm": 1.0128841400146484, "learning_rate": 8.724489795918366e-05, "loss": 1.0673, "step": 159 }, { "epoch": 2.9174311926605503, "grad_norm": 0.8075226545333862, "learning_rate": 8.57142857142857e-05, "loss": 1.0937, "step": 160 }, { "epoch": 2.9357798165137616, "grad_norm": 0.7399254441261292, "learning_rate": 8.418367346938774e-05, "loss": 0.8754, "step": 161 }, { "epoch": 2.9541284403669725, "grad_norm": 0.668204665184021, "learning_rate": 8.265306122448978e-05, "loss": 0.9136, "step": 162 }, { "epoch": 2.9724770642201834, "grad_norm": 0.7688079476356506, "learning_rate": 8.112244897959184e-05, "loss": 1.0803, "step": 163 }, { "epoch": 2.9908256880733948, "grad_norm": 0.6198843717575073, "learning_rate": 7.959183673469388e-05, "loss": 0.8621, "step": 164 }, { "epoch": 3.0, "grad_norm": 1.3351805210113525, "learning_rate": 7.806122448979592e-05, "loss": 1.1852, "step": 165 }, { "epoch": 3.0, "eval_loss": 1.7442647218704224, "eval_runtime": 38.7912, "eval_samples_per_second": 8.559, "eval_steps_per_second": 4.279, "step": 165 }, { "epoch": 3.018348623853211, "grad_norm": 1.8373321294784546, "learning_rate": 7.653061224489796e-05, "loss": 0.7368, "step": 166 }, { "epoch": 3.036697247706422, "grad_norm": 0.979532778263092, "learning_rate": 7.5e-05, "loss": 0.7749, "step": 167 }, { "epoch": 3.055045871559633, "grad_norm": 0.7946708798408508, "learning_rate": 7.346938775510203e-05, "loss": 0.9788, "step": 168 }, { "epoch": 3.073394495412844, "grad_norm": 0.9379284381866455, "learning_rate": 7.193877551020407e-05, "loss": 0.8699, "step": 169 }, { "epoch": 3.091743119266055, "grad_norm": 0.6809526085853577, "learning_rate": 7.040816326530612e-05, "loss": 1.106, "step": 170 }, { "epoch": 3.1100917431192663, "grad_norm": 0.9983639717102051, "learning_rate": 6.887755102040816e-05, "loss": 0.7375, "step": 171 }, { "epoch": 3.128440366972477, "grad_norm": 0.6771747469902039, "learning_rate": 6.73469387755102e-05, "loss": 1.0152, "step": 172 }, { "epoch": 3.146788990825688, "grad_norm": 0.7562909722328186, "learning_rate": 6.581632653061224e-05, "loss": 0.9061, "step": 173 }, { "epoch": 3.165137614678899, "grad_norm": 2.1360785961151123, "learning_rate": 6.428571428571427e-05, "loss": 0.9759, "step": 174 }, { "epoch": 3.18348623853211, "grad_norm": 0.8223655223846436, "learning_rate": 6.275510204081633e-05, "loss": 0.9013, "step": 175 }, { "epoch": 3.2018348623853212, "grad_norm": 0.639680802822113, "learning_rate": 6.122448979591836e-05, "loss": 1.0196, "step": 176 }, { "epoch": 3.220183486238532, "grad_norm": 0.7304046750068665, "learning_rate": 5.96938775510204e-05, "loss": 0.9197, "step": 177 }, { "epoch": 3.238532110091743, "grad_norm": 0.7084257006645203, "learning_rate": 5.816326530612244e-05, "loss": 0.787, "step": 178 }, { "epoch": 3.2568807339449544, "grad_norm": 0.6450588703155518, "learning_rate": 5.663265306122448e-05, "loss": 0.8951, "step": 179 }, { "epoch": 3.2752293577981653, "grad_norm": 0.584988534450531, "learning_rate": 5.510204081632653e-05, "loss": 0.7928, "step": 180 }, { "epoch": 3.293577981651376, "grad_norm": 0.6551803946495056, "learning_rate": 5.357142857142857e-05, "loss": 0.892, "step": 181 }, { "epoch": 3.311926605504587, "grad_norm": 0.6591671109199524, "learning_rate": 5.204081632653061e-05, "loss": 0.723, "step": 182 }, { "epoch": 3.330275229357798, "grad_norm": 0.7666569352149963, "learning_rate": 5.0510204081632645e-05, "loss": 0.8426, "step": 183 }, { "epoch": 3.3486238532110093, "grad_norm": 0.649147629737854, "learning_rate": 4.897959183673468e-05, "loss": 0.8232, "step": 184 }, { "epoch": 3.36697247706422, "grad_norm": 0.9067021012306213, "learning_rate": 4.7448979591836735e-05, "loss": 0.9875, "step": 185 }, { "epoch": 3.385321100917431, "grad_norm": 0.7133729457855225, "learning_rate": 4.591836734693877e-05, "loss": 1.1128, "step": 186 }, { "epoch": 3.4036697247706424, "grad_norm": 0.5974346399307251, "learning_rate": 4.438775510204081e-05, "loss": 0.6996, "step": 187 }, { "epoch": 3.4220183486238533, "grad_norm": 0.9456189870834351, "learning_rate": 4.285714285714285e-05, "loss": 0.9335, "step": 188 }, { "epoch": 3.4403669724770642, "grad_norm": 0.7025442719459534, "learning_rate": 4.132653061224489e-05, "loss": 0.9554, "step": 189 }, { "epoch": 3.458715596330275, "grad_norm": 0.6577914357185364, "learning_rate": 3.979591836734694e-05, "loss": 0.7704, "step": 190 }, { "epoch": 3.477064220183486, "grad_norm": 0.6810734868049622, "learning_rate": 3.826530612244898e-05, "loss": 0.9259, "step": 191 }, { "epoch": 3.4954128440366974, "grad_norm": 0.5843957662582397, "learning_rate": 3.6734693877551016e-05, "loss": 0.8781, "step": 192 }, { "epoch": 3.5137614678899083, "grad_norm": 0.711065948009491, "learning_rate": 3.520408163265306e-05, "loss": 0.6856, "step": 193 }, { "epoch": 3.532110091743119, "grad_norm": 0.5948845744132996, "learning_rate": 3.36734693877551e-05, "loss": 0.8639, "step": 194 }, { "epoch": 3.5504587155963305, "grad_norm": 1.0125257968902588, "learning_rate": 3.214285714285714e-05, "loss": 0.9718, "step": 195 }, { "epoch": 3.5688073394495414, "grad_norm": 0.7652119994163513, "learning_rate": 3.061224489795918e-05, "loss": 0.7567, "step": 196 }, { "epoch": 3.5871559633027523, "grad_norm": 0.6125350594520569, "learning_rate": 2.908163265306122e-05, "loss": 0.8161, "step": 197 }, { "epoch": 3.6055045871559632, "grad_norm": 0.6320289373397827, "learning_rate": 2.7551020408163265e-05, "loss": 0.8912, "step": 198 }, { "epoch": 3.623853211009174, "grad_norm": 0.6471022367477417, "learning_rate": 2.6020408163265303e-05, "loss": 0.9658, "step": 199 }, { "epoch": 3.6422018348623855, "grad_norm": 0.6660200953483582, "learning_rate": 2.448979591836734e-05, "loss": 0.7232, "step": 200 }, { "epoch": 3.6605504587155964, "grad_norm": 0.6752535700798035, "learning_rate": 2.2959183673469387e-05, "loss": 0.8049, "step": 201 }, { "epoch": 3.6788990825688073, "grad_norm": 0.7830822467803955, "learning_rate": 2.1428571428571425e-05, "loss": 0.8195, "step": 202 }, { "epoch": 3.6972477064220186, "grad_norm": 0.7607069611549377, "learning_rate": 1.989795918367347e-05, "loss": 0.7989, "step": 203 }, { "epoch": 3.7155963302752295, "grad_norm": 0.6341504454612732, "learning_rate": 1.8367346938775508e-05, "loss": 1.1087, "step": 204 }, { "epoch": 3.7339449541284404, "grad_norm": 0.8076742887496948, "learning_rate": 1.683673469387755e-05, "loss": 0.6579, "step": 205 }, { "epoch": 3.7522935779816513, "grad_norm": 0.6859648823738098, "learning_rate": 1.530612244897959e-05, "loss": 0.9131, "step": 206 }, { "epoch": 3.770642201834862, "grad_norm": 0.7903756499290466, "learning_rate": 1.3775510204081633e-05, "loss": 0.8616, "step": 207 }, { "epoch": 3.7889908256880735, "grad_norm": 0.7656455039978027, "learning_rate": 1.224489795918367e-05, "loss": 0.9456, "step": 208 }, { "epoch": 3.8073394495412844, "grad_norm": 0.6473223567008972, "learning_rate": 1.0714285714285712e-05, "loss": 0.7096, "step": 209 }, { "epoch": 3.8256880733944953, "grad_norm": 0.7863070368766785, "learning_rate": 9.183673469387754e-06, "loss": 0.8318, "step": 210 }, { "epoch": 3.8440366972477067, "grad_norm": 0.6368746161460876, "learning_rate": 7.653061224489796e-06, "loss": 0.6863, "step": 211 }, { "epoch": 3.8623853211009176, "grad_norm": 0.5635493993759155, "learning_rate": 6.122448979591835e-06, "loss": 0.8106, "step": 212 }, { "epoch": 3.8807339449541285, "grad_norm": 0.6674237251281738, "learning_rate": 4.591836734693877e-06, "loss": 0.819, "step": 213 }, { "epoch": 3.8990825688073394, "grad_norm": 0.5791789293289185, "learning_rate": 3.0612244897959177e-06, "loss": 0.7863, "step": 214 }, { "epoch": 3.9174311926605503, "grad_norm": 0.5922563672065735, "learning_rate": 1.5306122448979589e-06, "loss": 0.7798, "step": 215 }, { "epoch": 3.9357798165137616, "grad_norm": 0.5897291898727417, "learning_rate": 0.0, "loss": 0.8527, "step": 216 }, { "epoch": 3.9357798165137616, "eval_loss": 1.7457518577575684, "eval_runtime": 37.6339, "eval_samples_per_second": 8.822, "eval_steps_per_second": 4.411, "step": 216 } ], "logging_steps": 1, "max_steps": 216, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5619644304021094e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }