{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5712596162965194, "eval_steps": 5000, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014281490407412986, "grad_norm": 49.5, "learning_rate": 4.285714285714285e-05, "logits/chosen": -2.928096294403076, "logits/rejected": -2.9280941486358643, "logps/chosen": -3.81945538520813, "logps/rejected": -4.2658491134643555, "loss": 3.8875, "odds_ratio_loss": 7.880957126617432, "rewards/accuracies": 0.641796886920929, "rewards/chosen": -0.38194555044174194, "rewards/margins": 0.044639457017183304, "rewards/rejected": -0.42658501863479614, "sft_loss": 3.099365711212158, "step": 10 }, { "epoch": 0.028562980814825972, "grad_norm": 9.4375, "learning_rate": 8.57142857142857e-05, "logits/chosen": -3.5747146606445312, "logits/rejected": -3.5747463703155518, "logps/chosen": -1.0031383037567139, "logps/rejected": -1.795668601989746, "loss": 1.0575, "odds_ratio_loss": 3.653076171875, "rewards/accuracies": 0.7490234375, "rewards/chosen": -0.10031384229660034, "rewards/margins": 0.07925303280353546, "rewards/rejected": -0.1795668601989746, "sft_loss": 0.6921551823616028, "step": 20 }, { "epoch": 0.04284447122223896, "grad_norm": 6.375, "learning_rate": 0.00012857142857142855, "logits/chosen": -4.056135177612305, "logits/rejected": -4.05615234375, "logps/chosen": -0.7499507665634155, "logps/rejected": -1.9470783472061157, "loss": 0.7945, "odds_ratio_loss": 2.919811248779297, "rewards/accuracies": 0.791210949420929, "rewards/chosen": -0.07499508559703827, "rewards/margins": 0.11971275508403778, "rewards/rejected": -0.19470782577991486, "sft_loss": 0.5025397539138794, "step": 30 }, { "epoch": 0.057125961629651945, "grad_norm": 6.4375, "learning_rate": 0.0001714285714285714, "logits/chosen": -3.9972000122070312, "logits/rejected": -3.997206211090088, "logps/chosen": -0.6447885632514954, "logps/rejected": -2.2042269706726074, "loss": 0.6819, "odds_ratio_loss": 2.3861823081970215, "rewards/accuracies": 0.8414062261581421, "rewards/chosen": -0.06447885185480118, "rewards/margins": 0.1559438556432724, "rewards/rejected": -0.22042270004749298, "sft_loss": 0.4432622492313385, "step": 40 }, { "epoch": 0.07140745203706493, "grad_norm": 2.53125, "learning_rate": 0.00021428571428571427, "logits/chosen": -3.6315770149230957, "logits/rejected": -3.631592273712158, "logps/chosen": -0.5242542028427124, "logps/rejected": -2.490429401397705, "loss": 0.5524, "odds_ratio_loss": 1.806905746459961, "rewards/accuracies": 0.887890636920929, "rewards/chosen": -0.05242542549967766, "rewards/margins": 0.1966175138950348, "rewards/rejected": -0.24904294312000275, "sft_loss": 0.37168318033218384, "step": 50 }, { "epoch": 0.08568894244447792, "grad_norm": 242.0, "learning_rate": 0.0002571428571428571, "logits/chosen": -3.232466459274292, "logits/rejected": -3.2325031757354736, "logps/chosen": -0.8311947584152222, "logps/rejected": -2.9961485862731934, "loss": 0.8575, "odds_ratio_loss": 3.2764008045196533, "rewards/accuracies": 0.8935546875, "rewards/chosen": -0.0831194818019867, "rewards/margins": 0.21649539470672607, "rewards/rejected": -0.2996148467063904, "sft_loss": 0.5298588275909424, "step": 60 }, { "epoch": 0.0999704328518909, "grad_norm": 5.65625, "learning_rate": 0.0003, "logits/chosen": -2.657118558883667, "logits/rejected": -2.6572413444519043, "logps/chosen": -0.6468337774276733, "logps/rejected": -2.5454134941101074, "loss": 0.6815, "odds_ratio_loss": 2.252271890640259, "rewards/accuracies": 0.869921863079071, "rewards/chosen": -0.06468339264392853, "rewards/margins": 0.18985795974731445, "rewards/rejected": -0.2545413374900818, "sft_loss": 0.4562531113624573, "step": 70 }, { "epoch": 0.11425192325930389, "grad_norm": 4.25, "learning_rate": 0.0002998135381828383, "logits/chosen": -2.8170955181121826, "logits/rejected": -2.8171167373657227, "logps/chosen": -0.4997388422489166, "logps/rejected": -2.737879514694214, "loss": 0.5264, "odds_ratio_loss": 1.6635347604751587, "rewards/accuracies": 0.8994140625, "rewards/chosen": -0.04997389018535614, "rewards/margins": 0.22381405532360077, "rewards/rejected": -0.2737879753112793, "sft_loss": 0.3600570261478424, "step": 80 }, { "epoch": 0.12853341366671686, "grad_norm": 3.671875, "learning_rate": 0.0002992546163048102, "logits/chosen": -3.062329053878784, "logits/rejected": -3.0623464584350586, "logps/chosen": -0.4833168089389801, "logps/rejected": -2.676713466644287, "loss": 0.5082, "odds_ratio_loss": 1.625109314918518, "rewards/accuracies": 0.9037109613418579, "rewards/chosen": -0.048331670463085175, "rewards/margins": 0.21933968365192413, "rewards/rejected": -0.2676713764667511, "sft_loss": 0.3456498384475708, "step": 90 }, { "epoch": 0.14281490407412986, "grad_norm": 3.828125, "learning_rate": 0.0002983246239337692, "logits/chosen": -2.9990651607513428, "logits/rejected": -2.9990792274475098, "logps/chosen": -0.45154619216918945, "logps/rejected": -2.796322822570801, "loss": 0.4752, "odds_ratio_loss": 1.524524450302124, "rewards/accuracies": 0.907421886920929, "rewards/chosen": -0.04515461623668671, "rewards/margins": 0.2344777137041092, "rewards/rejected": -0.2796323001384735, "sft_loss": 0.3227214813232422, "step": 100 }, { "epoch": 0.15709639448154283, "grad_norm": 4.625, "learning_rate": 0.00029702587317728153, "logits/chosen": -3.0073421001434326, "logits/rejected": -3.0073623657226562, "logps/chosen": -0.4845556318759918, "logps/rejected": -2.689493417739868, "loss": 0.5115, "odds_ratio_loss": 1.6119966506958008, "rewards/accuracies": 0.8880859613418579, "rewards/chosen": -0.04845556616783142, "rewards/margins": 0.22049376368522644, "rewards/rejected": -0.26894932985305786, "sft_loss": 0.3503072261810303, "step": 110 }, { "epoch": 0.17137788488895583, "grad_norm": 4.25, "learning_rate": 0.00029536159293436166, "logits/chosen": -3.0959103107452393, "logits/rejected": -3.095935583114624, "logps/chosen": -0.4538491368293762, "logps/rejected": -2.8936073780059814, "loss": 0.4767, "odds_ratio_loss": 1.5220377445220947, "rewards/accuracies": 0.9091796875, "rewards/chosen": -0.04538491368293762, "rewards/margins": 0.24397583305835724, "rewards/rejected": -0.28936073184013367, "sft_loss": 0.32447534799575806, "step": 120 }, { "epoch": 0.1856593752963688, "grad_norm": 3.015625, "learning_rate": 0.00029333592086792107, "logits/chosen": -3.2102882862091064, "logits/rejected": -3.210312604904175, "logps/chosen": -0.4560007154941559, "logps/rejected": -2.9516916275024414, "loss": 0.4775, "odds_ratio_loss": 1.4568021297454834, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.04560007154941559, "rewards/margins": 0.2495690882205963, "rewards/rejected": -0.2951691746711731, "sft_loss": 0.33186882734298706, "step": 130 }, { "epoch": 0.1999408657037818, "grad_norm": 4.625, "learning_rate": 0.0002909538931178862, "logits/chosen": -3.1817660331726074, "logits/rejected": -3.181790828704834, "logps/chosen": -0.4694591164588928, "logps/rejected": -2.694068193435669, "loss": 0.4932, "odds_ratio_loss": 1.5894033908843994, "rewards/accuracies": 0.903515636920929, "rewards/chosen": -0.046945907175540924, "rewards/margins": 0.22246094048023224, "rewards/rejected": -0.26940685510635376, "sft_loss": 0.3343026041984558, "step": 140 }, { "epoch": 0.21422235611119478, "grad_norm": 2.109375, "learning_rate": 0.00028822143178056114, "logits/chosen": -3.19804310798645, "logits/rejected": -3.1980957984924316, "logps/chosen": -0.4429679811000824, "logps/rejected": -2.8848023414611816, "loss": 0.4646, "odds_ratio_loss": 1.4565680027008057, "rewards/accuracies": 0.9175781011581421, "rewards/chosen": -0.04429679363965988, "rewards/margins": 0.24418342113494873, "rewards/rejected": -0.2884802222251892, "sft_loss": 0.31895238161087036, "step": 150 }, { "epoch": 0.22850384651860778, "grad_norm": 2.53125, "learning_rate": 0.0002851453301853628, "logits/chosen": -3.1286864280700684, "logits/rejected": -3.1287217140197754, "logps/chosen": -0.4620634913444519, "logps/rejected": -2.8395490646362305, "loss": 0.4861, "odds_ratio_loss": 1.5178568363189697, "rewards/accuracies": 0.8980468511581421, "rewards/chosen": -0.04620635136961937, "rewards/margins": 0.23774857819080353, "rewards/rejected": -0.283954918384552, "sft_loss": 0.33431634306907654, "step": 160 }, { "epoch": 0.24278533692602075, "grad_norm": 3.8125, "learning_rate": 0.0002817332360055343, "logits/chosen": -3.0237438678741455, "logits/rejected": -3.0237746238708496, "logps/chosen": -0.4375666677951813, "logps/rejected": -2.892333507537842, "loss": 0.4602, "odds_ratio_loss": 1.4504070281982422, "rewards/accuracies": 0.9097656011581421, "rewards/chosen": -0.04375666379928589, "rewards/margins": 0.24547667801380157, "rewards/rejected": -0.28923335671424866, "sft_loss": 0.31515270471572876, "step": 170 }, { "epoch": 0.2570668273334337, "grad_norm": 2.625, "learning_rate": 0.0002779936322448233, "logits/chosen": -3.0108470916748047, "logits/rejected": -3.0108840465545654, "logps/chosen": -0.43512678146362305, "logps/rejected": -3.0354106426239014, "loss": 0.4572, "odds_ratio_loss": 1.3499114513397217, "rewards/accuracies": 0.9091796875, "rewards/chosen": -0.043512679636478424, "rewards/margins": 0.2600283920764923, "rewards/rejected": -0.30354106426239014, "sft_loss": 0.32224926352500916, "step": 180 }, { "epoch": 0.2713483177408467, "grad_norm": 2.78125, "learning_rate": 0.00027393581614739923, "logits/chosen": -3.0553345680236816, "logits/rejected": -3.055368423461914, "logps/chosen": -0.42374086380004883, "logps/rejected": -2.915168523788452, "loss": 0.445, "odds_ratio_loss": 1.4213359355926514, "rewards/accuracies": 0.9175781011581421, "rewards/chosen": -0.042374081909656525, "rewards/margins": 0.24914276599884033, "rewards/rejected": -0.29151684045791626, "sft_loss": 0.3028421401977539, "step": 190 }, { "epoch": 0.2856298081482597, "grad_norm": 2.421875, "learning_rate": 0.0002695698760834384, "logits/chosen": -2.994476318359375, "logits/rejected": -2.9945011138916016, "logps/chosen": -0.4544607102870941, "logps/rejected": -2.8547749519348145, "loss": 0.4776, "odds_ratio_loss": 1.4777902364730835, "rewards/accuracies": 0.9033203125, "rewards/chosen": -0.04544607177376747, "rewards/margins": 0.240031436085701, "rewards/rejected": -0.28547748923301697, "sft_loss": 0.32984623312950134, "step": 200 }, { "epoch": 0.2999112985556727, "grad_norm": 3.546875, "learning_rate": 0.00026490666646784665, "logits/chosen": -3.063324451446533, "logits/rejected": -3.063349723815918, "logps/chosen": -0.43639254570007324, "logps/rejected": -3.105325698852539, "loss": 0.4578, "odds_ratio_loss": 1.3640453815460205, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.04363925755023956, "rewards/margins": 0.26689332723617554, "rewards/rejected": -0.3105325698852539, "sft_loss": 0.3214019536972046, "step": 210 }, { "epoch": 0.31419278896308567, "grad_norm": 2.296875, "learning_rate": 0.0002599577807744739, "logits/chosen": -3.115455389022827, "logits/rejected": -3.1154801845550537, "logps/chosen": -0.4168368875980377, "logps/rejected": -3.096985340118408, "loss": 0.4371, "odds_ratio_loss": 1.3493207693099976, "rewards/accuracies": 0.9164062738418579, "rewards/chosen": -0.04168368875980377, "rewards/margins": 0.2680148482322693, "rewards/rejected": -0.30969855189323425, "sft_loss": 0.3021194040775299, "step": 220 }, { "epoch": 0.32847427937049867, "grad_norm": 1.921875, "learning_rate": 0.0002547355227129109, "logits/chosen": -3.162436008453369, "logits/rejected": -3.1624579429626465, "logps/chosen": -0.4433667063713074, "logps/rejected": -3.0064072608947754, "loss": 0.4655, "odds_ratio_loss": 1.3855293989181519, "rewards/accuracies": 0.912304699420929, "rewards/chosen": -0.04433666914701462, "rewards/margins": 0.2563040852546692, "rewards/rejected": -0.3006407618522644, "sft_loss": 0.3269914984703064, "step": 230 }, { "epoch": 0.34275576977791167, "grad_norm": 1.2734375, "learning_rate": 0.0002492528756395289, "logits/chosen": -3.1590659618377686, "logits/rejected": -3.1590869426727295, "logps/chosen": -0.4178268015384674, "logps/rejected": -3.0285942554473877, "loss": 0.4376, "odds_ratio_loss": 1.3180664777755737, "rewards/accuracies": 0.917187511920929, "rewards/chosen": -0.0417826846241951, "rewards/margins": 0.26107674837112427, "rewards/rejected": -0.30285942554473877, "sft_loss": 0.3058391213417053, "step": 240 }, { "epoch": 0.35703726018532467, "grad_norm": 4.125, "learning_rate": 0.00024352347027881003, "logits/chosen": -3.2828221321105957, "logits/rejected": -3.282838821411133, "logps/chosen": -0.4194249212741852, "logps/rejected": -3.0415470600128174, "loss": 0.4403, "odds_ratio_loss": 1.3526116609573364, "rewards/accuracies": 0.9166015386581421, "rewards/chosen": -0.041942495852708817, "rewards/margins": 0.26221221685409546, "rewards/rejected": -0.3041546940803528, "sft_loss": 0.3050472140312195, "step": 250 }, { "epoch": 0.3713187505927376, "grad_norm": 2.640625, "learning_rate": 0.00023756155083521846, "logits/chosen": -3.22637677192688, "logits/rejected": -3.2263927459716797, "logps/chosen": -0.41995421051979065, "logps/rejected": -3.0328941345214844, "loss": 0.4417, "odds_ratio_loss": 1.3745537996292114, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.04199542477726936, "rewards/margins": 0.26129403710365295, "rewards/rejected": -0.3032894432544708, "sft_loss": 0.30421775579452515, "step": 260 }, { "epoch": 0.3856002410001506, "grad_norm": 2.578125, "learning_rate": 0.0002313819395798639, "logits/chosen": -3.186093330383301, "logits/rejected": -3.1861069202423096, "logps/chosen": -0.4216841161251068, "logps/rejected": -3.125148057937622, "loss": 0.4439, "odds_ratio_loss": 1.3766355514526367, "rewards/accuracies": 0.913281261920929, "rewards/chosen": -0.04216841608285904, "rewards/margins": 0.27034634351730347, "rewards/rejected": -0.3125148117542267, "sft_loss": 0.30621883273124695, "step": 270 }, { "epoch": 0.3998817314075636, "grad_norm": 2.03125, "learning_rate": 0.000225, "logits/chosen": -3.167736530303955, "logits/rejected": -3.167752265930176, "logps/chosen": -0.42215681076049805, "logps/rejected": -3.0445384979248047, "loss": 0.4415, "odds_ratio_loss": 1.3319975137710571, "rewards/accuracies": 0.9189453125, "rewards/chosen": -0.04221567511558533, "rewards/margins": 0.26223814487457275, "rewards/rejected": -0.30445384979248047, "sft_loss": 0.3082923889160156, "step": 280 }, { "epoch": 0.4141632218149766, "grad_norm": 1.9140625, "learning_rate": 0.00021843159860297442, "logits/chosen": -3.2080886363983154, "logits/rejected": -3.208108425140381, "logps/chosen": -0.3955201208591461, "logps/rejected": -3.1413300037384033, "loss": 0.4162, "odds_ratio_loss": 1.298853874206543, "rewards/accuracies": 0.9166015386581421, "rewards/chosen": -0.03955201059579849, "rewards/margins": 0.27458101511001587, "rewards/rejected": -0.3141329884529114, "sft_loss": 0.28630274534225464, "step": 290 }, { "epoch": 0.42844471222238956, "grad_norm": 2.296875, "learning_rate": 0.00021169306546959174, "logits/chosen": -3.1625964641571045, "logits/rejected": -3.1626217365264893, "logps/chosen": -0.4014604091644287, "logps/rejected": -3.231706142425537, "loss": 0.4211, "odds_ratio_loss": 1.2652801275253296, "rewards/accuracies": 0.91796875, "rewards/chosen": -0.04014604538679123, "rewards/margins": 0.2830246090888977, "rewards/rejected": -0.32317066192626953, "sft_loss": 0.2946000099182129, "step": 300 }, { "epoch": 0.44272620262980256, "grad_norm": 1.9765625, "learning_rate": 0.00020480115365495926, "logits/chosen": -3.1747231483459473, "logits/rejected": -3.1747519969940186, "logps/chosen": -0.3960801064968109, "logps/rejected": -3.082359790802002, "loss": 0.4173, "odds_ratio_loss": 1.3159233331680298, "rewards/accuracies": 0.9173828363418579, "rewards/chosen": -0.03960801288485527, "rewards/margins": 0.26862797141075134, "rewards/rejected": -0.30823594331741333, "sft_loss": 0.2857065200805664, "step": 310 }, { "epoch": 0.45700769303721556, "grad_norm": 2.90625, "learning_rate": 0.00019777299753775265, "logits/chosen": -3.2027382850646973, "logits/rejected": -3.202775478363037, "logps/chosen": -0.3917561173439026, "logps/rejected": -3.128166675567627, "loss": 0.4113, "odds_ratio_loss": 1.213888168334961, "rewards/accuracies": 0.923046886920929, "rewards/chosen": -0.0391756072640419, "rewards/margins": 0.27364104986190796, "rewards/rejected": -0.31281667947769165, "sft_loss": 0.28995418548583984, "step": 320 }, { "epoch": 0.47128918344462856, "grad_norm": 1.1171875, "learning_rate": 0.00019062607022145078, "logits/chosen": -3.223431348800659, "logits/rejected": -3.2234749794006348, "logps/chosen": -0.3959726393222809, "logps/rejected": -3.1301980018615723, "loss": 0.4159, "odds_ratio_loss": 1.2752103805541992, "rewards/accuracies": 0.9134765863418579, "rewards/chosen": -0.03959726542234421, "rewards/margins": 0.2734225392341614, "rewards/rejected": -0.3130198121070862, "sft_loss": 0.2884255647659302, "step": 330 }, { "epoch": 0.4855706738520415, "grad_norm": 2.421875, "learning_rate": 0.00018337814009344714, "logits/chosen": -3.229165554046631, "logits/rejected": -3.22920298576355, "logps/chosen": -0.40863022208213806, "logps/rejected": -3.2035133838653564, "loss": 0.4293, "odds_ratio_loss": 1.3022868633270264, "rewards/accuracies": 0.923046886920929, "rewards/chosen": -0.040863025933504105, "rewards/margins": 0.27948835492134094, "rewards/rejected": -0.32035139203071594, "sft_loss": 0.29906368255615234, "step": 340 }, { "epoch": 0.4998521642594545, "grad_norm": 1.6953125, "learning_rate": 0.00017604722665003956, "logits/chosen": -3.268237590789795, "logits/rejected": -3.268270969390869, "logps/chosen": -0.3820918798446655, "logps/rejected": -3.3142802715301514, "loss": 0.4012, "odds_ratio_loss": 1.2229855060577393, "rewards/accuracies": 0.924023449420929, "rewards/chosen": -0.03820918872952461, "rewards/margins": 0.29321882128715515, "rewards/rejected": -0.33142799139022827, "sft_loss": 0.278933584690094, "step": 350 }, { "epoch": 0.5141336546668674, "grad_norm": 2.375, "learning_rate": 0.00016865155569712278, "logits/chosen": -3.3011035919189453, "logits/rejected": -3.3011412620544434, "logps/chosen": -0.38040798902511597, "logps/rejected": -3.2756595611572266, "loss": 0.3985, "odds_ratio_loss": 1.1769336462020874, "rewards/accuracies": 0.9302734136581421, "rewards/chosen": -0.03804079815745354, "rewards/margins": 0.2895251214504242, "rewards/rejected": -0.3275659680366516, "sft_loss": 0.28078263998031616, "step": 360 }, { "epoch": 0.5284151450742804, "grad_norm": 1.484375, "learning_rate": 0.00016120951403796364, "logits/chosen": -3.336045026779175, "logits/rejected": -3.3360836505889893, "logps/chosen": -0.3747532069683075, "logps/rejected": -3.3229317665100098, "loss": 0.3938, "odds_ratio_loss": 1.2170263528823853, "rewards/accuracies": 0.924023449420929, "rewards/chosen": -0.03747531771659851, "rewards/margins": 0.29481783509254456, "rewards/rejected": -0.33229315280914307, "sft_loss": 0.2720716595649719, "step": 370 }, { "epoch": 0.5426966354816934, "grad_norm": 1.1953125, "learning_rate": 0.00015373960376071093, "logits/chosen": -3.3047919273376465, "logits/rejected": -3.3048160076141357, "logps/chosen": -0.37664586305618286, "logps/rejected": -3.1220898628234863, "loss": 0.3961, "odds_ratio_loss": 1.2487261295318604, "rewards/accuracies": 0.921875, "rewards/chosen": -0.037664588540792465, "rewards/margins": 0.2745443880558014, "rewards/rejected": -0.31220895051956177, "sft_loss": 0.2712169289588928, "step": 380 }, { "epoch": 0.5569781258891064, "grad_norm": 2.015625, "learning_rate": 0.00014626039623928907, "logits/chosen": -3.3392891883850098, "logits/rejected": -3.339310884475708, "logps/chosen": -0.3588925302028656, "logps/rejected": -3.4983272552490234, "loss": 0.3772, "odds_ratio_loss": 1.1625574827194214, "rewards/accuracies": 0.927734375, "rewards/chosen": -0.03588924929499626, "rewards/margins": 0.31394344568252563, "rewards/rejected": -0.3498327136039734, "sft_loss": 0.2609647512435913, "step": 390 }, { "epoch": 0.5712596162965194, "grad_norm": 1.5078125, "learning_rate": 0.00013879048596203636, "logits/chosen": -3.382007598876953, "logits/rejected": -3.3820137977600098, "logps/chosen": -0.3696475028991699, "logps/rejected": -3.3737378120422363, "loss": 0.3872, "odds_ratio_loss": 1.1461818218231201, "rewards/accuracies": 0.929882824420929, "rewards/chosen": -0.03696475178003311, "rewards/margins": 0.3004090189933777, "rewards/rejected": -0.3373737931251526, "sft_loss": 0.27257078886032104, "step": 400 } ], "logging_steps": 10, "max_steps": 700, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.1756323933484155e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }